server: implement ctdb_control_get_public_ip_info()
[ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* Timeout built from the takeover_timeout tunable via timeval_current_ofs().
   The expansion refers to a variable named 'ctdb', which must be in scope
   wherever the macro is used. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* CTDB_ARP_INTERVAL: seconds part of the delay between two gratuitous ARP
   rounds.  CTDB_ARP_REPEAT: number of rounds sent before the ARP state is
   freed. */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT   3
35
/*
  one local network interface that public addresses may be assigned to;
  kept on the ctdb->ifaces list
 */
struct ctdb_iface {
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name used to look the entry up */
        const char *name;
        /* set to true when the entry is created */
        bool link_up;
        /* number of vnns currently assigned to this interface */
        uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         i->link_up = true;
69
70         DLIST_ADD(ctdb->ifaces, i);
71
72         return 0;
73 }
74
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
76                                           const char *iface)
77 {
78         struct ctdb_iface *i;
79
80         /* Verify that we dont have an entry for this ip yet */
81         for (i=ctdb->ifaces;i;i=i->next) {
82                 if (strcmp(i->name, iface) == 0) {
83                         return i;
84                 }
85         }
86
87         return NULL;
88 }
89
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
91                                               struct ctdb_vnn *vnn)
92 {
93         int i;
94         struct ctdb_iface *cur = NULL;
95         struct ctdb_iface *best = NULL;
96
97         for (i=0; vnn->ifaces[i]; i++) {
98
99                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
100                 if (cur == NULL) {
101                         continue;
102                 }
103
104                 if (!cur->link_up) {
105                         continue;
106                 }
107
108                 if (best == NULL) {
109                         best = cur;
110                         continue;
111                 }
112
113                 if (cur->references < best->references) {
114                         best = cur;
115                         continue;
116                 }
117         }
118
119         return best;
120 }
121
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123                                      struct ctdb_vnn *vnn)
124 {
125         struct ctdb_iface *best = NULL;
126
127         if (vnn->iface) {
128                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129                                    "still assigned to iface '%s'\n",
130                                    ctdb_addr_to_str(&vnn->public_address),
131                                    ctdb_vnn_iface_string(vnn)));
132                 return 0;
133         }
134
135         best = ctdb_vnn_best_iface(ctdb, vnn);
136         if (best == NULL) {
137                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138                                   "cannot assign to iface any iface\n",
139                                   ctdb_addr_to_str(&vnn->public_address)));
140                 return -1;
141         }
142
143         vnn->iface = best;
144         best->references++;
145         vnn->pnn = ctdb->pnn;
146
147         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148                            "now assigned to iface '%s' refs[%d]\n",
149                            ctdb_addr_to_str(&vnn->public_address),
150                            ctdb_vnn_iface_string(vnn),
151                            best->references));
152         return 0;
153 }
154
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156                                     struct ctdb_vnn *vnn)
157 {
158         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159                            "now unassigned (old iface '%s' refs[%d])\n",
160                            ctdb_addr_to_str(&vnn->public_address),
161                            ctdb_vnn_iface_string(vnn),
162                            vnn->iface?vnn->iface->references:0));
163         if (vnn->iface) {
164                 vnn->iface->references--;
165         }
166         vnn->iface = NULL;
167         if (vnn->pnn == ctdb->pnn) {
168                 vnn->pnn = -1;
169         }
170 }
171
172 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
173                                struct ctdb_vnn *vnn)
174 {
175         int i;
176
177         if (vnn->iface && vnn->iface->link_up) {
178                 return true;
179         }
180
181         for (i=0; vnn->ifaces[i]; i++) {
182                 struct ctdb_iface *cur;
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (cur->link_up) {
190                         return true;
191                 }
192         }
193
194         return false;
195 }
196
197 struct ctdb_takeover_arp {
198         struct ctdb_context *ctdb;
199         uint32_t count;
200         ctdb_sock_addr addr;
201         struct ctdb_tcp_array *tcparray;
202         struct ctdb_vnn *vnn;
203 };
204
205
206 /*
207   lists of tcp endpoints
208  */
209 struct ctdb_tcp_list {
210         struct ctdb_tcp_list *prev, *next;
211         struct ctdb_tcp_connection connection;
212 };
213
214 /*
215   list of clients to kill on IP release
216  */
217 struct ctdb_client_ip {
218         struct ctdb_client_ip *prev, *next;
219         struct ctdb_context *ctdb;
220         ctdb_sock_addr addr;
221         uint32_t client_id;
222 };
223
224
225 /*
226   send a gratuitous arp
227  */
228 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
229                                   struct timeval t, void *private_data)
230 {
231         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
232                                                         struct ctdb_takeover_arp);
233         int i, ret;
234         struct ctdb_tcp_array *tcparray;
235         const char *iface = ctdb_vnn_iface_string(arp->vnn);
236
237         ret = ctdb_sys_send_arp(&arp->addr, iface);
238         if (ret != 0) {
239                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
240                                   iface, strerror(errno)));
241         }
242
243         tcparray = arp->tcparray;
244         if (tcparray) {
245                 for (i=0;i<tcparray->num;i++) {
246                         struct ctdb_tcp_connection *tcon;
247
248                         tcon = &tcparray->connections[i];
249                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
250                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
251                                 ctdb_addr_to_str(&tcon->src_addr),
252                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
253                         ret = ctdb_sys_send_tcp(
254                                 &tcon->src_addr, 
255                                 &tcon->dst_addr,
256                                 0, 0, 0);
257                         if (ret != 0) {
258                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
259                                         ctdb_addr_to_str(&tcon->src_addr)));
260                         }
261                 }
262         }
263
264         arp->count++;
265
266         if (arp->count == CTDB_ARP_REPEAT) {
267                 talloc_free(arp);
268                 return;
269         }
270
271         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
272                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
273                         ctdb_control_send_arp, arp);
274 }
275
276 struct takeover_callback_state {
277         struct ctdb_req_control *c;
278         ctdb_sock_addr *addr;
279         struct ctdb_vnn *vnn;
280 };
281
282 /*
283   called when takeip event finishes
284  */
285 static void takeover_ip_callback(struct ctdb_context *ctdb, int status, 
286                                  void *private_data)
287 {
288         struct takeover_callback_state *state = 
289                 talloc_get_type(private_data, struct takeover_callback_state);
290         struct ctdb_takeover_arp *arp;
291         struct ctdb_tcp_array *tcparray;
292
293         if (status != 0) {
294                 if (status == -ETIME) {
295                         ctdb_ban_self(ctdb);
296                 }
297                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
298                         ctdb_addr_to_str(state->addr),
299                         ctdb_vnn_iface_string(state->vnn)));
300                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
301                 talloc_free(state);
302                 return;
303         }
304
305         if (!state->vnn->takeover_ctx) {
306                 state->vnn->takeover_ctx = talloc_new(state->vnn);
307                 if (!state->vnn->takeover_ctx) {
308                         goto failed;
309                 }
310         }
311
312         arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
313         if (!arp) goto failed;
314         
315         arp->ctdb = ctdb;
316         arp->addr = *state->addr;
317         arp->vnn  = state->vnn;
318
319         tcparray = state->vnn->tcp_array;
320         if (tcparray) {
321                 /* add all of the known tcp connections for this IP to the
322                    list of tcp connections to send tickle acks for */
323                 arp->tcparray = talloc_steal(arp, tcparray);
324
325                 state->vnn->tcp_array = NULL;
326                 state->vnn->tcp_update_needed = true;
327         }
328
329         event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx, 
330                         timeval_zero(), ctdb_control_send_arp, arp);
331
332         /* the control succeeded */
333         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
334         talloc_free(state);
335         return;
336
337 failed:
338         ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
339         talloc_free(state);
340         return;
341 }
342
343 /*
344   Find the vnn of the node that has a public ip address
345   returns -1 if the address is not known as a public address
346  */
347 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
348 {
349         struct ctdb_vnn *vnn;
350
351         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
352                 if (ctdb_same_ip(&vnn->public_address, addr)) {
353                         return vnn;
354                 }
355         }
356
357         return NULL;
358 }
359
360 /*
361   take over an ip address
362  */
363 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, 
364                                  struct ctdb_req_control *c,
365                                  TDB_DATA indata, 
366                                  bool *async_reply)
367 {
368         int ret;
369         struct takeover_callback_state *state;
370         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
371         struct ctdb_vnn *vnn;
372
373         /* update out vnn list */
374         vnn = find_public_ip_vnn(ctdb, &pip->addr);
375         if (vnn == NULL) {
376                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n", 
377                         ctdb_addr_to_str(&pip->addr)));
378                 return 0;
379         }
380         vnn->pnn = pip->pnn;
381
382         /* if our kernel already has this IP, do nothing */
383         if (ctdb_sys_have_ip(&pip->addr)) {
384                 return 0;
385         }
386
387         ret = ctdb_vnn_assign_iface(ctdb, vnn);
388         if (ret != 0) {
389                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
390                                  "assin a usable interface\n",
391                                  ctdb_addr_to_str(&pip->addr),
392                                  vnn->public_netmask_bits));
393                 return -1;
394         }
395
396         state = talloc(vnn, struct takeover_callback_state);
397         CTDB_NO_MEMORY(ctdb, state);
398
399         state->c = talloc_steal(ctdb, c);
400         state->addr = talloc(ctdb, ctdb_sock_addr);
401         CTDB_NO_MEMORY(ctdb, state->addr);
402
403         *state->addr = pip->addr;
404         state->vnn   = vnn;
405
406         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n", 
407                 ctdb_addr_to_str(&pip->addr),
408                 vnn->public_netmask_bits, 
409                 ctdb_vnn_iface_string(vnn)));
410
411         ret = ctdb_event_script_callback(ctdb, 
412                                          state, takeover_ip_callback, state,
413                                          false,
414                                          CTDB_EVENT_TAKE_IP,
415                                          "%s %s %u",
416                                          ctdb_vnn_iface_string(vnn),
417                                          ctdb_addr_to_str(&pip->addr),
418                                          vnn->public_netmask_bits);
419
420         if (ret != 0) {
421                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
422                         ctdb_addr_to_str(&pip->addr),
423                         ctdb_vnn_iface_string(vnn)));
424                 talloc_free(state);
425                 return -1;
426         }
427
428         /* tell ctdb_control.c that we will be replying asynchronously */
429         *async_reply = true;
430
431         return 0;
432 }
433
434 /*
435   takeover an ip address old v4 style
436  */
437 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
438                                 struct ctdb_req_control *c,
439                                 TDB_DATA indata, 
440                                 bool *async_reply)
441 {
442         TDB_DATA data;
443         
444         data.dsize = sizeof(struct ctdb_public_ip);
445         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
446         CTDB_NO_MEMORY(ctdb, data.dptr);
447         
448         memcpy(data.dptr, indata.dptr, indata.dsize);
449         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
450 }
451
452 /*
453   kill any clients that are registered with a IP that is being released
454  */
455 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
456 {
457         struct ctdb_client_ip *ip;
458
459         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
460                 ctdb_addr_to_str(addr)));
461
462         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
463                 ctdb_sock_addr tmp_addr;
464
465                 tmp_addr = ip->addr;
466                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
467                         ip->client_id,
468                         ctdb_addr_to_str(&ip->addr)));
469
470                 if (ctdb_same_ip(&tmp_addr, addr)) {
471                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
472                                                                      ip->client_id, 
473                                                                      struct ctdb_client);
474                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
475                                 ip->client_id,
476                                 ctdb_addr_to_str(&ip->addr),
477                                 client->pid));
478
479                         if (client->pid != 0) {
480                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
481                                         (unsigned)client->pid,
482                                         ctdb_addr_to_str(addr),
483                                         ip->client_id));
484                                 kill(client->pid, SIGKILL);
485                         }
486                 }
487         }
488 }
489
490 /*
491   called when releaseip event finishes
492  */
493 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
494                                 void *private_data)
495 {
496         struct takeover_callback_state *state = 
497                 talloc_get_type(private_data, struct takeover_callback_state);
498         TDB_DATA data;
499
500         if (status == -ETIME) {
501                 ctdb_ban_self(ctdb);
502         }
503
504         /* send a message to all clients of this node telling them
505            that the cluster has been reconfigured and they should
506            release any sockets on this IP */
507         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
508         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
509         data.dsize = strlen((char *)data.dptr)+1;
510
511         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
512
513         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
514
515         /* kill clients that have registered with this IP */
516         release_kill_clients(ctdb, state->addr);
517
518         ctdb_vnn_unassign_iface(ctdb, state->vnn);
519
520         /* the control succeeded */
521         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
522         talloc_free(state);
523 }
524
525 /*
526   release an ip address
527  */
528 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
529                                 struct ctdb_req_control *c,
530                                 TDB_DATA indata, 
531                                 bool *async_reply)
532 {
533         int ret;
534         struct takeover_callback_state *state;
535         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
536         struct ctdb_vnn *vnn;
537
538         /* update our vnn list */
539         vnn = find_public_ip_vnn(ctdb, &pip->addr);
540         if (vnn == NULL) {
541                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
542                         ctdb_addr_to_str(&pip->addr)));
543                 return 0;
544         }
545         vnn->pnn = pip->pnn;
546
547         /* stop any previous arps */
548         talloc_free(vnn->takeover_ctx);
549         vnn->takeover_ctx = NULL;
550
551         if (!ctdb_sys_have_ip(&pip->addr)) {
552                 DEBUG(DEBUG_NOTICE,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
553                         ctdb_addr_to_str(&pip->addr),
554                         vnn->public_netmask_bits, 
555                         ctdb_vnn_iface_string(vnn)));
556                 ctdb_vnn_unassign_iface(ctdb, vnn);
557                 return 0;
558         }
559
560         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%u\n", 
561                 ctdb_addr_to_str(&pip->addr),
562                 vnn->public_netmask_bits, 
563                 ctdb_vnn_iface_string(vnn),
564                 pip->pnn));
565
566         state = talloc(ctdb, struct takeover_callback_state);
567         CTDB_NO_MEMORY(ctdb, state);
568
569         state->c = talloc_steal(state, c);
570         state->addr = talloc(state, ctdb_sock_addr);       
571         CTDB_NO_MEMORY(ctdb, state->addr);
572         *state->addr = pip->addr;
573         state->vnn   = vnn;
574
575         ret = ctdb_event_script_callback(ctdb, 
576                                          state, release_ip_callback, state,
577                                          false,
578                                          CTDB_EVENT_RELEASE_IP,
579                                          "%s %s %u",
580                                          ctdb_vnn_iface_string(vnn),
581                                          ctdb_addr_to_str(&pip->addr),
582                                          vnn->public_netmask_bits);
583         if (ret != 0) {
584                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
585                         ctdb_addr_to_str(&pip->addr),
586                         ctdb_vnn_iface_string(vnn)));
587                 talloc_free(state);
588                 return -1;
589         }
590
591         /* tell the control that we will be reply asynchronously */
592         *async_reply = true;
593         return 0;
594 }
595
596 /*
597   release an ip address old v4 style
598  */
599 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
600                                 struct ctdb_req_control *c,
601                                 TDB_DATA indata, 
602                                 bool *async_reply)
603 {
604         TDB_DATA data;
605         
606         data.dsize = sizeof(struct ctdb_public_ip);
607         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
608         CTDB_NO_MEMORY(ctdb, data.dptr);
609         
610         memcpy(data.dptr, indata.dptr, indata.dsize);
611         return ctdb_control_release_ip(ctdb, c, data, async_reply);
612 }
613
614
615 static int ctdb_add_public_address(struct ctdb_context *ctdb,
616                                    ctdb_sock_addr *addr,
617                                    unsigned mask, const char *ifaces)
618 {
619         struct ctdb_vnn      *vnn;
620         uint32_t num = 0;
621         char *tmp;
622         const char *iface;
623         int i;
624         int ret;
625
626         /* Verify that we dont have an entry for this ip yet */
627         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
628                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
629                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
630                                 ctdb_addr_to_str(addr)));
631                         return -1;
632                 }               
633         }
634
635         /* create a new vnn structure for this ip address */
636         vnn = talloc_zero(ctdb, struct ctdb_vnn);
637         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
638         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
639         tmp = talloc_strdup(vnn, ifaces);
640         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
641         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
642                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
643                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
644                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
645                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
646                 num++;
647         }
648         talloc_free(tmp);
649         vnn->ifaces[num] = NULL;
650         vnn->public_address      = *addr;
651         vnn->public_netmask_bits = mask;
652         vnn->pnn                 = -1;
653
654         for (i=0; vnn->ifaces[i]; i++) {
655                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
656                 if (ret != 0) {
657                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
658                                            "for public_address[%s]\n",
659                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
660                         talloc_free(vnn);
661                         return -1;
662                 }
663         }
664
665         DLIST_ADD(ctdb->vnn, vnn);
666
667         return 0;
668 }
669
670 /*
671   setup the event script directory
672 */
673 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
674 {
675         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
676         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
677         return 0;
678 }
679
680 /*
681   setup the public address lists from a file
682 */
683 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
684 {
685         char **lines;
686         int nlines;
687         int i;
688
689         lines = file_lines_load(alist, &nlines, ctdb);
690         if (lines == NULL) {
691                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
692                 return -1;
693         }
694         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
695                 nlines--;
696         }
697
698         for (i=0;i<nlines;i++) {
699                 unsigned mask;
700                 ctdb_sock_addr addr;
701                 const char *addrstr;
702                 const char *ifaces;
703                 char *tok, *line;
704
705                 line = lines[i];
706                 while ((*line == ' ') || (*line == '\t')) {
707                         line++;
708                 }
709                 if (*line == '#') {
710                         continue;
711                 }
712                 if (strcmp(line, "") == 0) {
713                         continue;
714                 }
715                 tok = strtok(line, " \t");
716                 addrstr = tok;
717                 tok = strtok(NULL, " \t");
718                 if (tok == NULL) {
719                         if (NULL == ctdb->default_public_interface) {
720                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
721                                          i+1));
722                                 talloc_free(lines);
723                                 return -1;
724                         }
725                         ifaces = ctdb->default_public_interface;
726                 } else {
727                         ifaces = tok;
728                 }
729
730                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
731                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
732                         talloc_free(lines);
733                         return -1;
734                 }
735                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
736                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
737                         talloc_free(lines);
738                         return -1;
739                 }
740         }
741
742         talloc_free(lines);
743         return 0;
744 }
745
746 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
747                               const char *iface,
748                               const char *ip)
749 {
750         struct ctdb_vnn *svnn;
751         bool ok;
752         int ret;
753
754         svnn = talloc_zero(ctdb, struct ctdb_vnn);
755         CTDB_NO_MEMORY(ctdb, svnn);
756
757         svnn->ifaces = talloc_array(svnn, const char *, 2);
758         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
759         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
760         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
761         svnn->ifaces[1] = NULL;
762
763         ok = parse_ip(ip, iface, 0, &svnn->public_address);
764         if (!ok) {
765                 talloc_free(svnn);
766                 return -1;
767         }
768
769         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
770         if (ret != 0) {
771                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
772                                    "for single_ip[%s]\n",
773                                    svnn->ifaces[0],
774                                    ctdb_addr_to_str(&svnn->public_address)));
775                 talloc_free(svnn);
776                 return -1;
777         }
778
779         ret = ctdb_vnn_assign_iface(ctdb, svnn);
780         if (ret != 0) {
781                 talloc_free(svnn);
782                 return -1;
783         }
784
785         ctdb->single_ip_vnn = svnn;
786         return 0;
787 }
788
789 struct ctdb_public_ip_list {
790         struct ctdb_public_ip_list *next;
791         uint32_t pnn;
792         ctdb_sock_addr addr;
793 };
794
795
796 /* Given a physical node, return the number of
797    public addresses that is currently assigned to this node.
798 */
799 static int node_ip_coverage(struct ctdb_context *ctdb, 
800         int32_t pnn,
801         struct ctdb_public_ip_list *ips)
802 {
803         int num=0;
804
805         for (;ips;ips=ips->next) {
806                 if (ips->pnn == pnn) {
807                         num++;
808                 }
809         }
810         return num;
811 }
812
813
814 /* Check if this is a public ip known to the node, i.e. can that
815    node takeover this ip ?
816 */
817 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
818                 struct ctdb_public_ip_list *ip)
819 {
820         struct ctdb_all_public_ips *public_ips;
821         int i;
822
823         public_ips = ctdb->nodes[pnn]->available_public_ips;
824
825         if (public_ips == NULL) {
826                 return -1;
827         }
828
829         for (i=0;i<public_ips->num;i++) {
830                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
831                         /* yes, this node can serve this public ip */
832                         return 0;
833                 }
834         }
835
836         return -1;
837 }
838
839
840 /* search the node lists list for a node to takeover this ip.
841    pick the node that currently are serving the least number of ips
842    so that the ips get spread out evenly.
843 */
844 static int find_takeover_node(struct ctdb_context *ctdb, 
845                 struct ctdb_node_map *nodemap, uint32_t mask, 
846                 struct ctdb_public_ip_list *ip,
847                 struct ctdb_public_ip_list *all_ips)
848 {
849         int pnn, min=0, num;
850         int i;
851
852         pnn    = -1;
853         for (i=0;i<nodemap->num;i++) {
854                 if (nodemap->nodes[i].flags & mask) {
855                         /* This node is not healty and can not be used to serve
856                            a public address 
857                         */
858                         continue;
859                 }
860
861                 /* verify that this node can serve this ip */
862                 if (can_node_serve_ip(ctdb, i, ip)) {
863                         /* no it couldnt   so skip to the next node */
864                         continue;
865                 }
866
867                 num = node_ip_coverage(ctdb, i, all_ips);
868                 /* was this the first node we checked ? */
869                 if (pnn == -1) {
870                         pnn = i;
871                         min  = num;
872                 } else {
873                         if (num < min) {
874                                 pnn = i;
875                                 min  = num;
876                         }
877                 }
878         }       
879         if (pnn == -1) {
880                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
881                         ctdb_addr_to_str(&ip->addr)));
882
883                 return -1;
884         }
885
886         ip->pnn = pnn;
887         return 0;
888 }
889
890 #define IP_KEYLEN       4
891 static uint32_t *ip_key(ctdb_sock_addr *ip)
892 {
893         static uint32_t key[IP_KEYLEN];
894
895         bzero(key, sizeof(key));
896
897         switch (ip->sa.sa_family) {
898         case AF_INET:
899                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
900                 break;
901         case AF_INET6:
902                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
903                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
904                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
905                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
906                 break;
907         default:
908                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
909                 return key;
910         }
911
912         return key;
913 }
914
/*
  Callback handed to trbt_insertarray32_callback() when public
  addresses are merged into the tree.  It hands back 'parm' unchanged
  and never looks at 'data'.
  NOTE(review): presumably 'data' is what the tree already holds for
  the key and the return value is what gets stored - confirm against
  the rb_tree implementation.
*/
static void *add_ip_callback(void *parm, void *data)
{
        void *chosen = parm;

        (void)data;

        return chosen;
}
919
920 void getips_count_callback(void *param, void *data)
921 {
922         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
923         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
924
925         new_ip->next = *ip_list;
926         *ip_list     = new_ip;
927 }
928
929 struct ctdb_public_ip_list *
930 create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
931 {
932         int i, j;
933         struct ctdb_public_ip_list *ip_list;
934         struct ctdb_all_public_ips *public_ips;
935         trbt_tree_t *ip_tree;
936
937         ip_tree = trbt_create(tmp_ctx, 0);
938
939         for (i=0;i<ctdb->num_nodes;i++) {
940                 public_ips = ctdb->nodes[i]->known_public_ips;
941
942                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
943                         continue;
944                 }
945
946                 /* there were no public ips for this node */
947                 if (public_ips == NULL) {
948                         continue;
949                 }               
950
951                 for (j=0;j<public_ips->num;j++) {
952                         struct ctdb_public_ip_list *tmp_ip; 
953
954                         tmp_ip = talloc_zero(tmp_ctx, struct ctdb_public_ip_list);
955                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
956                         tmp_ip->pnn  = public_ips->ips[j].pnn;
957                         tmp_ip->addr = public_ips->ips[j].addr;
958                         tmp_ip->next = NULL;
959
960                         trbt_insertarray32_callback(ip_tree,
961                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
962                                 add_ip_callback,
963                                 tmp_ip);
964                 }
965         }
966
967         ip_list = NULL;
968         trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
969
970         return ip_list;
971 }
972
/*
  make any IP alias changes for public addresses that are necessary 

  The work is done in these phases:
   1. pick the node flag mask that excludes nodes from hosting
      addresses (inactive+disabled nodes, or only inactive nodes when
      no node is completely healthy)
   2. build the merged list of all public addresses
   3. optionally reset the allocation deterministically, then unassign
      (pnn = -1) every address whose node is masked out or can not
      serve it
   4. give every unassigned address to a node via find_takeover_node()
   5. unless NoIPFailback is set, rebalance by unassigning one address
      from the most loaded node and repeating step 4 (at most 5 times
      for the whole run)
   6. send RELEASE_IP controls to all connected nodes for the addresses
      they should not hold and wait for them to complete, then send
      TAKEOVER_IP controls to the owning nodes and wait again

  Returns 0 on success and -1 if a control could not be sent or one of
  the async waits failed.  The temporary talloc context is freed on
  each of these return paths.
 */
int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
{
        int i, num_healthy, retries;
        struct ctdb_public_ip ip;
        struct ctdb_public_ipv4 ipv4;
        uint32_t mask;
        struct ctdb_public_ip_list *all_ips, *tmp_ip;
        int maxnode, maxnum=0, minnode, minnum=0, num;
        TDB_DATA data;
        struct timeval timeout;
        struct client_async_data *async_data;
        struct ctdb_client_control_state *state;
        /* everything allocated below hangs off this context */
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);


        ZERO_STRUCT(ip);

        /* Count how many completely healthy nodes we have */
        num_healthy = 0;
        for (i=0;i<nodemap->num;i++) {
                if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
                        num_healthy++;
                }
        }

        if (num_healthy > 0) {
                /* We have healthy nodes, so only consider them for 
                   serving public addresses
                */
                mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
        } else {
                /* We didn't have any completely healthy nodes so
                   use "disabled" nodes as a fallback
                */
                mask = NODE_FLAGS_INACTIVE;
        }

        /* since nodes only know about those public addresses that
           can be served by that particular node, no single node has
           a full list of all public addresses that exist in the cluster.
           Walk over all node structures and create a merged list of
           all public addresses that exist in the cluster.
        */
        all_ips = create_merged_ip_list(ctdb, tmp_ctx);

        /* If we want deterministic ip allocations, i.e. that the ip addresses
           will always be allocated the same way for a specific set of
           available/unavailable nodes, start from a fixed round-robin
           assignment over the merged list.
        */
        if (1 == ctdb->tunable.deterministic_public_ips) {              
                DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
                for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
                        tmp_ip->pnn = i%nodemap->num;
                }
        }


        /* mark all public addresses with a masked node as being served by
           node -1
        */
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        continue;
                }
                if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
                        tmp_ip->pnn = -1;
                }
        }

        /* verify that the assigned nodes can serve that public ip
           and set it to -1 if not
        */
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        continue;
                }
                if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
                        /* this node can not serve this ip. */
                        tmp_ip->pnn = -1;
                }
        }


        /* now we must redistribute all public addresses with takeover node
           -1 among the nodes available
        */
        retries = 0;
try_again:
        /* loop over all ip's and find a physical node to cover for 
           each unassigned ip.
           A failure here is only logged; the address keeps pnn -1.
        */
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
                                DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
                                        ctdb_addr_to_str(&tmp_ip->addr)));
                        }
                }
        }

        /* If we don't want ips to fail back after a node becomes healthy
           again, we won't even try to reallocate the ip addresses so that
           they are evenly spread out.
           This can NOT be used at the same time as DeterministicIPs !
           (the conflict is only reported; the rebalancing is skipped
           either way)
        */
        if (1 == ctdb->tunable.no_ip_failback) {
                if (1 == ctdb->tunable.deterministic_public_ips) {
                        DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
                }
                goto finished;
        }


        /* now, try to make sure the ip addresses are evenly distributed
           across the nodes.
           for each ip address, loop over all nodes that can serve this
           ip and make sure that the difference between the node
           serving the most and the node serving the least ip's is not greater
           than 1.
        */
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        continue;
                }

                /* Get the highest and lowest number of ips's served by any 
                   valid node which can serve this ip.
                */
                maxnode = -1;
                minnode = -1;
                for (i=0;i<nodemap->num;i++) {
                        if (nodemap->nodes[i].flags & mask) {
                                continue;
                        }

                        /* only check nodes that can actually serve this ip */
                        if (can_node_serve_ip(ctdb, i, tmp_ip)) {
                                /* no it couldn't, so skip to the next node */
                                continue;
                        }

                        num = node_ip_coverage(ctdb, i, all_ips);
                        if (maxnode == -1) {
                                maxnode = i;
                                maxnum  = num;
                        } else {
                                if (num > maxnum) {
                                        maxnode = i;
                                        maxnum  = num;
                                }
                        }
                        if (minnode == -1) {
                                minnode = i;
                                minnum  = num;
                        } else {
                                if (num < minnum) {
                                        minnode = i;
                                        minnum  = num;
                                }
                        }
                }
                if (maxnode == -1) {
                        DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
                                ctdb_addr_to_str(&tmp_ip->addr)));

                        continue;
                }

                /* If we want deterministic IPs then don't try to reallocate 
                   them to spread out the load.
                */
                if (1 == ctdb->tunable.deterministic_public_ips) {
                        continue;
                }

                /* if the spread between the smallest and largest coverage by
                   a node is >=2 we steal one of the ips from the node with
                   most coverage to even things out a bit.
                   try to do this at most 5 times  since we don't want to spend
                   too much time balancing the ip coverage.
                   'retries' is never reset, so the limit of 5 applies to
                   the whole run and not to each address.
                */
                if ( (maxnum > minnum+1)
                  && (retries < 5) ){
                        struct ctdb_public_ip_list *tmp;

                        /* mark one of maxnode's vnn's as unassigned and try
                           again
                        */
                        for (tmp=all_ips;tmp;tmp=tmp->next) {
                                if (tmp->pnn == maxnode) {
                                        tmp->pnn = -1;
                                        retries++;
                                        goto try_again;
                                }
                        }
                }
        }


        /* finished distributing the public addresses, now just send the 
           info out to the nodes
        */
finished:

        /* at this point ->pnn is the node which will own each IP
           or -1 if there is no node that can cover this ip
        */

        /* now tell all nodes to delete any alias that they should not
           have.  This will be a NOOP on nodes that don't currently
           hold the given alias */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);

        for (i=0;i<nodemap->num;i++) {
                /* don't talk to unconnected nodes, but do talk to banned nodes */
                if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
                        continue;
                }

                for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                        if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
                                /* This node should be serving this
                                   vnn so don't tell it to release the ip
                                */
                                continue;
                        }
                        /* ipv4 addresses are sent with the IPv4 variant
                           of the control and its smaller payload, all
                           other addresses with the generic variant
                        */
                        if (tmp_ip->addr.sa.sa_family == AF_INET) {
                                ipv4.pnn = tmp_ip->pnn;
                                ipv4.sin = tmp_ip->addr.ip;

                                timeout = TAKEOVER_TIMEOUT();
                                data.dsize = sizeof(ipv4);
                                data.dptr  = (uint8_t *)&ipv4;
                                state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
                                                0, CTDB_CONTROL_RELEASE_IPv4, 0,
                                                data, async_data,
                                                &timeout, NULL);
                        } else {
                                ip.pnn  = tmp_ip->pnn;
                                ip.addr = tmp_ip->addr;

                                timeout = TAKEOVER_TIMEOUT();
                                data.dsize = sizeof(ip);
                                data.dptr  = (uint8_t *)&ip;
                                state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
                                                0, CTDB_CONTROL_RELEASE_IP, 0,
                                                data, async_data,
                                                &timeout, NULL);
                        }

                        if (state == NULL) {
                                DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
                                talloc_free(tmp_ctx);
                                return -1;
                        }
                
                        ctdb_client_async_add(async_data, state);
                }
        }
        /* all releases must have completed before any node is asked to
           take an address over
        */
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
        talloc_free(async_data);


        /* tell all nodes to get their own IPs */
        async_data = talloc_zero(tmp_ctx, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);
        for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                if (tmp_ip->pnn == -1) {
                        /* this IP won't be taken over */
                        continue;
                }

                if (tmp_ip->addr.sa.sa_family == AF_INET) {
                        ipv4.pnn = tmp_ip->pnn;
                        ipv4.sin = tmp_ip->addr.ip;

                        timeout = TAKEOVER_TIMEOUT();
                        data.dsize = sizeof(ipv4);
                        data.dptr  = (uint8_t *)&ipv4;
                        state = ctdb_control_send(ctdb, tmp_ip->pnn,
                                        0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
                                        data, async_data,
                                        &timeout, NULL);
                } else {
                        ip.pnn  = tmp_ip->pnn;
                        ip.addr = tmp_ip->addr;

                        timeout = TAKEOVER_TIMEOUT();
                        data.dsize = sizeof(ip);
                        data.dptr  = (uint8_t *)&ip;
                        state = ctdb_control_send(ctdb, tmp_ip->pnn,
                                        0, CTDB_CONTROL_TAKEOVER_IP, 0,
                                        data, async_data,
                                        &timeout, NULL);
                }
                if (state == NULL) {
                        DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
                        talloc_free(tmp_ctx);
                        return -1;
                }
                
                ctdb_client_async_add(async_data, state);
        }
        if (ctdb_client_async_wait(ctdb, async_data) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

        talloc_free(tmp_ctx);
        return 0;
}
1293
1294
1295 /*
1296   destroy a ctdb_client_ip structure
1297  */
1298 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1299 {
1300         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1301                 ctdb_addr_to_str(&ip->addr),
1302                 ntohs(ip->addr.ip.sin_port),
1303                 ip->client_id));
1304
1305         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1306         return 0;
1307 }
1308
1309 /*
1310   called by a client to inform us of a TCP connection that it is managing
1311   that should tickled with an ACK when IP takeover is done
1312   we handle both the old ipv4 style of packets as well as the new ipv4/6
1313   pdus.
1314  */
1315 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1316                                 TDB_DATA indata)
1317 {
1318         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1319         struct ctdb_control_tcp *old_addr = NULL;
1320         struct ctdb_control_tcp_addr new_addr;
1321         struct ctdb_control_tcp_addr *tcp_sock = NULL;
1322         struct ctdb_tcp_list *tcp;
1323         struct ctdb_control_tcp_vnn t;
1324         int ret;
1325         TDB_DATA data;
1326         struct ctdb_client_ip *ip;
1327         struct ctdb_vnn *vnn;
1328         ctdb_sock_addr addr;
1329
1330         switch (indata.dsize) {
1331         case sizeof(struct ctdb_control_tcp):
1332                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1333                 ZERO_STRUCT(new_addr);
1334                 tcp_sock = &new_addr;
1335                 tcp_sock->src.ip  = old_addr->src;
1336                 tcp_sock->dest.ip = old_addr->dest;
1337                 break;
1338         case sizeof(struct ctdb_control_tcp_addr):
1339                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1340                 break;
1341         default:
1342                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1343                                  "to ctdb_control_tcp_client. size was %d but "
1344                                  "only allowed sizes are %lu and %lu\n",
1345                                  (int)indata.dsize,
1346                                  (long unsigned)sizeof(struct ctdb_control_tcp),
1347                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
1348                 return -1;
1349         }
1350
1351         addr = tcp_sock->src;
1352         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
1353         addr = tcp_sock->dest;
1354         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1355
1356         ZERO_STRUCT(addr);
1357         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1358         vnn = find_public_ip_vnn(ctdb, &addr);
1359         if (vnn == NULL) {
1360                 switch (addr.sa.sa_family) {
1361                 case AF_INET:
1362                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1363                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
1364                                         ctdb_addr_to_str(&addr)));
1365                         }
1366                         break;
1367                 case AF_INET6:
1368                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
1369                                 ctdb_addr_to_str(&addr)));
1370                         break;
1371                 default:
1372                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1373                 }
1374
1375                 return 0;
1376         }
1377
1378         if (vnn->pnn != ctdb->pnn) {
1379                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1380                         ctdb_addr_to_str(&addr),
1381                         client_id, client->pid));
1382                 /* failing this call will tell smbd to die */
1383                 return -1;
1384         }
1385
1386         ip = talloc(client, struct ctdb_client_ip);
1387         CTDB_NO_MEMORY(ctdb, ip);
1388
1389         ip->ctdb      = ctdb;
1390         ip->addr      = addr;
1391         ip->client_id = client_id;
1392         talloc_set_destructor(ip, ctdb_client_ip_destructor);
1393         DLIST_ADD(ctdb->client_ip_list, ip);
1394
1395         tcp = talloc(client, struct ctdb_tcp_list);
1396         CTDB_NO_MEMORY(ctdb, tcp);
1397
1398         tcp->connection.src_addr = tcp_sock->src;
1399         tcp->connection.dst_addr = tcp_sock->dest;
1400
1401         DLIST_ADD(client->tcp_list, tcp);
1402
1403         t.src  = tcp_sock->src;
1404         t.dest = tcp_sock->dest;
1405
1406         data.dptr = (uint8_t *)&t;
1407         data.dsize = sizeof(t);
1408
1409         switch (addr.sa.sa_family) {
1410         case AF_INET:
1411                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1412                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
1413                         ctdb_addr_to_str(&tcp_sock->src),
1414                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1415                 break;
1416         case AF_INET6:
1417                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1418                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
1419                         ctdb_addr_to_str(&tcp_sock->src),
1420                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1421                 break;
1422         default:
1423                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1424         }
1425
1426
1427         /* tell all nodes about this tcp connection */
1428         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1429                                        CTDB_CONTROL_TCP_ADD,
1430                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1431         if (ret != 0) {
1432                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1433                 return -1;
1434         }
1435
1436         return 0;
1437 }
1438
1439 /*
1440   find a tcp address on a list
1441  */
1442 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
1443                                            struct ctdb_tcp_connection *tcp)
1444 {
1445         int i;
1446
1447         if (array == NULL) {
1448                 return NULL;
1449         }
1450
1451         for (i=0;i<array->num;i++) {
1452                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1453                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1454                         return &array->connections[i];
1455                 }
1456         }
1457         return NULL;
1458 }
1459
1460 /*
1461   called by a daemon to inform us of a TCP connection that one of its
1462   clients managing that should tickled with an ACK when IP takeover is
1463   done
1464  */
1465 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
1466 {
1467         struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
1468         struct ctdb_tcp_array *tcparray;
1469         struct ctdb_tcp_connection tcp;
1470         struct ctdb_vnn *vnn;
1471
1472         vnn = find_public_ip_vnn(ctdb, &p->dest);
1473         if (vnn == NULL) {
1474                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1475                         ctdb_addr_to_str(&p->dest)));
1476
1477                 return -1;
1478         }
1479
1480
1481         tcparray = vnn->tcp_array;
1482
1483         /* If this is the first tickle */
1484         if (tcparray == NULL) {
1485                 tcparray = talloc_size(ctdb->nodes, 
1486                         offsetof(struct ctdb_tcp_array, connections) +
1487                         sizeof(struct ctdb_tcp_connection) * 1);
1488                 CTDB_NO_MEMORY(ctdb, tcparray);
1489                 vnn->tcp_array = tcparray;
1490
1491                 tcparray->num = 0;
1492                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1493                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1494
1495                 tcparray->connections[tcparray->num].src_addr = p->src;
1496                 tcparray->connections[tcparray->num].dst_addr = p->dest;
1497                 tcparray->num++;
1498                 return 0;
1499         }
1500
1501
1502         /* Do we already have this tickle ?*/
1503         tcp.src_addr = p->src;
1504         tcp.dst_addr = p->dest;
1505         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1506                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1507                         ctdb_addr_to_str(&tcp.dst_addr),
1508                         ntohs(tcp.dst_addr.ip.sin_port),
1509                         vnn->pnn));
1510                 return 0;
1511         }
1512
1513         /* A new tickle, we must add it to the array */
1514         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1515                                         struct ctdb_tcp_connection,
1516                                         tcparray->num+1);
1517         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1518
1519         vnn->tcp_array = tcparray;
1520         tcparray->connections[tcparray->num].src_addr = p->src;
1521         tcparray->connections[tcparray->num].dst_addr = p->dest;
1522         tcparray->num++;
1523                                 
1524         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1525                 ctdb_addr_to_str(&tcp.dst_addr),
1526                 ntohs(tcp.dst_addr.ip.sin_port),
1527                 vnn->pnn));
1528
1529         return 0;
1530 }
1531
1532
1533 /*
1534   called by a daemon to inform us of a TCP connection that one of its
1535   clients managing that should tickled with an ACK when IP takeover is
1536   done
1537  */
1538 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1539 {
1540         struct ctdb_tcp_connection *tcpp;
1541         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1542
1543         if (vnn == NULL) {
1544                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1545                         ctdb_addr_to_str(&conn->dst_addr)));
1546                 return;
1547         }
1548
1549         /* if the array is empty we cant remove it
1550            and we dont need to do anything
1551          */
1552         if (vnn->tcp_array == NULL) {
1553                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1554                         ctdb_addr_to_str(&conn->dst_addr),
1555                         ntohs(conn->dst_addr.ip.sin_port)));
1556                 return;
1557         }
1558
1559
1560         /* See if we know this connection
1561            if we dont know this connection  then we dont need to do anything
1562          */
1563         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1564         if (tcpp == NULL) {
1565                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1566                         ctdb_addr_to_str(&conn->dst_addr),
1567                         ntohs(conn->dst_addr.ip.sin_port)));
1568                 return;
1569         }
1570
1571
1572         /* We need to remove this entry from the array.
1573            Instead of allocating a new array and copying data to it
1574            we cheat and just copy the last entry in the existing array
1575            to the entry that is to be removed and just shring the 
1576            ->num field
1577          */
1578         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1579         vnn->tcp_array->num--;
1580
1581         /* If we deleted the last entry we also need to remove the entire array
1582          */
1583         if (vnn->tcp_array->num == 0) {
1584                 talloc_free(vnn->tcp_array);
1585                 vnn->tcp_array = NULL;
1586         }               
1587
1588         vnn->tcp_update_needed = true;
1589
1590         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1591                 ctdb_addr_to_str(&conn->src_addr),
1592                 ntohs(conn->src_addr.ip.sin_port)));
1593 }
1594
1595
/*
  called when a daemon restarts - the intention is to send all tickles
  for all public addresses we are serving immediately to the new node.
  That is not implemented yet: both arguments are ignored and 0 is
  returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
1605
1606
1607 /*
1608   called when a client structure goes away - hook to remove
1609   elements from the tcp_list in all daemons
1610  */
1611 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1612 {
1613         while (client->tcp_list) {
1614                 struct ctdb_tcp_list *tcp = client->tcp_list;
1615                 DLIST_REMOVE(client->tcp_list, tcp);
1616                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1617         }
1618 }
1619
1620
1621 /*
1622   release all IPs on shutdown
1623  */
1624 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1625 {
1626         struct ctdb_vnn *vnn;
1627
1628         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1629                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1630                         ctdb_vnn_unassign_iface(ctdb, vnn);
1631                         continue;
1632                 }
1633                 if (!vnn->iface) {
1634                         continue;
1635                 }
1636                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1637                                   ctdb_vnn_iface_string(vnn),
1638                                   ctdb_addr_to_str(&vnn->public_address),
1639                                   vnn->public_netmask_bits);
1640                 release_kill_clients(ctdb, &vnn->public_address);
1641                 ctdb_vnn_unassign_iface(ctdb, vnn);
1642         }
1643 }
1644
1645
1646 /*
1647   get list of public IPs
1648  */
1649 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
1650                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1651 {
1652         int i, num, len;
1653         struct ctdb_all_public_ips *ips;
1654         struct ctdb_vnn *vnn;
1655         bool only_available = false;
1656
1657         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
1658                 only_available = true;
1659         }
1660
1661         /* count how many public ip structures we have */
1662         num = 0;
1663         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1664                 num++;
1665         }
1666
1667         len = offsetof(struct ctdb_all_public_ips, ips) + 
1668                 num*sizeof(struct ctdb_public_ip);
1669         ips = talloc_zero_size(outdata, len);
1670         CTDB_NO_MEMORY(ctdb, ips);
1671
1672         i = 0;
1673         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1674                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
1675                         continue;
1676                 }
1677                 ips->ips[i].pnn  = vnn->pnn;
1678                 ips->ips[i].addr = vnn->public_address;
1679                 i++;
1680         }
1681         ips->num = i;
1682         len = offsetof(struct ctdb_all_public_ips, ips) +
1683                 i*sizeof(struct ctdb_public_ip);
1684
1685         outdata->dsize = len;
1686         outdata->dptr  = (uint8_t *)ips;
1687
1688         return 0;
1689 }
1690
1691
1692 /*
1693   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
1694  */
1695 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
1696                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1697 {
1698         int i, num, len;
1699         struct ctdb_all_public_ipsv4 *ips;
1700         struct ctdb_vnn *vnn;
1701
1702         /* count how many public ip structures we have */
1703         num = 0;
1704         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1705                 if (vnn->public_address.sa.sa_family != AF_INET) {
1706                         continue;
1707                 }
1708                 num++;
1709         }
1710
1711         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
1712                 num*sizeof(struct ctdb_public_ipv4);
1713         ips = talloc_zero_size(outdata, len);
1714         CTDB_NO_MEMORY(ctdb, ips);
1715
1716         outdata->dsize = len;
1717         outdata->dptr  = (uint8_t *)ips;
1718
1719         ips->num = num;
1720         i = 0;
1721         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1722                 if (vnn->public_address.sa.sa_family != AF_INET) {
1723                         continue;
1724                 }
1725                 ips->ips[i].pnn = vnn->pnn;
1726                 ips->ips[i].sin = vnn->public_address.ip;
1727                 i++;
1728         }
1729
1730         return 0;
1731 }
1732
1733 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
1734                                         struct ctdb_req_control *c,
1735                                         TDB_DATA indata,
1736                                         TDB_DATA *outdata)
1737 {
1738         int i, num, len;
1739         ctdb_sock_addr *addr;
1740         struct ctdb_control_public_ip_info *info;
1741         struct ctdb_vnn *vnn;
1742
1743         addr = (ctdb_sock_addr *)indata.dptr;
1744
1745         vnn = find_public_ip_vnn(ctdb, addr);
1746         if (vnn == NULL) {
1747                 /* if it is not a public ip   it could be our 'single ip' */
1748                 if (ctdb->single_ip_vnn) {
1749                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
1750                                 vnn = ctdb->single_ip_vnn;
1751                         }
1752                 }
1753         }
1754         if (vnn == NULL) {
1755                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
1756                                  "'%s'not a public address\n",
1757                                  ctdb_addr_to_str(addr)));
1758                 return -1;
1759         }
1760
1761         /* count how many public ip structures we have */
1762         num = 0;
1763         for (;vnn->ifaces[num];) {
1764                 num++;
1765         }
1766
1767         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
1768                 num*sizeof(struct ctdb_control_iface_info);
1769         info = talloc_zero_size(outdata, len);
1770         CTDB_NO_MEMORY(ctdb, info);
1771
1772         info->ip.addr = vnn->public_address;
1773         info->ip.pnn = vnn->pnn;
1774         info->active_idx = 0xFFFFFFFF;
1775
1776         for (i=0; vnn->ifaces[i]; i++) {
1777                 struct ctdb_iface *cur;
1778
1779                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
1780                 if (cur == NULL) {
1781                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
1782                                            vnn->ifaces[i]));
1783                         return -1;
1784                 }
1785                 if (vnn->iface == cur) {
1786                         info->active_idx = i;
1787                 }
1788                 strcpy(info->ifaces[i].name, cur->name);
1789                 info->ifaces[i].link_state = cur->link_up;
1790                 info->ifaces[i].references = cur->references;
1791         }
1792         info->num = i;
1793         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
1794                 i*sizeof(struct ctdb_control_iface_info);
1795
1796         outdata->dsize = len;
1797         outdata->dptr  = (uint8_t *)info;
1798
1799         return 0;
1800 }
1801
1802 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
1803                                 struct ctdb_req_control *c,
1804                                 TDB_DATA *outdata)
1805 {
1806         return -1;
1807 }
1808
1809 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
1810                                     struct ctdb_req_control *c,
1811                                     TDB_DATA indata)
1812 {
1813         return -1;
1814 }
1815
1816
1817 /* 
1818    structure containing the listening socket and the list of tcp connections
1819    that the ctdb daemon is to kill
1820 */
1821 struct ctdb_kill_tcp {
1822         struct ctdb_vnn *vnn;
1823         struct ctdb_context *ctdb;
1824         int capture_fd;
1825         struct fd_event *fde;
1826         trbt_tree_t *connections;
1827         void *private_data;
1828 };
1829
1830 /*
1831   a tcp connection that is to be killed
1832  */
1833 struct ctdb_killtcp_con {
1834         ctdb_sock_addr src_addr;
1835         ctdb_sock_addr dst_addr;
1836         int count;
1837         struct ctdb_kill_tcp *killtcp;
1838 };
1839
1840 /* this function is used to create a key to represent this socketpair
1841    in the killtcp tree.
1842    this key is used to insert and lookup matching socketpairs that are
1843    to be tickled and RST
1844 */
1845 #define KILLTCP_KEYLEN  10
1846 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
1847 {
1848         static uint32_t key[KILLTCP_KEYLEN];
1849
1850         bzero(key, sizeof(key));
1851
1852         if (src->sa.sa_family != dst->sa.sa_family) {
1853                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
1854                 return key;
1855         }
1856         
1857         switch (src->sa.sa_family) {
1858         case AF_INET:
1859                 key[0]  = dst->ip.sin_addr.s_addr;
1860                 key[1]  = src->ip.sin_addr.s_addr;
1861                 key[2]  = dst->ip.sin_port;
1862                 key[3]  = src->ip.sin_port;
1863                 break;
1864         case AF_INET6:
1865                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
1866                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
1867                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
1868                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
1869                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
1870                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
1871                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
1872                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
1873                 key[8]  = dst->ip6.sin6_port;
1874                 key[9]  = src->ip6.sin6_port;
1875                 break;
1876         default:
1877                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
1878                 return key;
1879         }
1880
1881         return key;
1882 }
1883
1884 /*
1885   called when we get a read event on the raw socket
1886  */
1887 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
1888                                 uint16_t flags, void *private_data)
1889 {
1890         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1891         struct ctdb_killtcp_con *con;
1892         ctdb_sock_addr src, dst;
1893         uint32_t ack_seq, seq;
1894
1895         if (!(flags & EVENT_FD_READ)) {
1896                 return;
1897         }
1898
1899         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
1900                                 killtcp->private_data,
1901                                 &src, &dst,
1902                                 &ack_seq, &seq) != 0) {
1903                 /* probably a non-tcp ACK packet */
1904                 return;
1905         }
1906
1907         /* check if we have this guy in our list of connections
1908            to kill
1909         */
1910         con = trbt_lookuparray32(killtcp->connections, 
1911                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
1912         if (con == NULL) {
1913                 /* no this was some other packet we can just ignore */
1914                 return;
1915         }
1916
1917         /* This one has been tickled !
1918            now reset him and remove him from the list.
1919          */
1920         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
1921                 ntohs(con->dst_addr.ip.sin_port),
1922                 ctdb_addr_to_str(&con->src_addr),
1923                 ntohs(con->src_addr.ip.sin_port)));
1924
1925         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
1926         talloc_free(con);
1927 }
1928
1929
1930 /* when traversing the list of all tcp connections to send tickle acks to
1931    (so that we can capture the ack coming back and kill the connection
1932     by a RST)
1933    this callback is called for each connection we are currently trying to kill
1934 */
1935 static void tickle_connection_traverse(void *param, void *data)
1936 {
1937         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
1938
1939         /* have tried too many times, just give up */
1940         if (con->count >= 5) {
1941                 talloc_free(con);
1942                 return;
1943         }
1944
1945         /* othervise, try tickling it again */
1946         con->count++;
1947         ctdb_sys_send_tcp(
1948                 (ctdb_sock_addr *)&con->dst_addr,
1949                 (ctdb_sock_addr *)&con->src_addr,
1950                 0, 0, 0);
1951 }
1952
1953
1954 /* 
1955    called every second until all sentenced connections have been reset
1956  */
1957 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
1958                                               struct timeval t, void *private_data)
1959 {
1960         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1961
1962
1963         /* loop over all connections sending tickle ACKs */
1964         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, NULL);
1965
1966
1967         /* If there are no more connections to kill we can remove the
1968            entire killtcp structure
1969          */
1970         if ( (killtcp->connections == NULL) || 
1971              (killtcp->connections->root == NULL) ) {
1972                 talloc_free(killtcp);
1973                 return;
1974         }
1975
1976         /* try tickling them again in a seconds time
1977          */
1978         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
1979                         ctdb_tickle_sentenced_connections, killtcp);
1980 }
1981
1982 /*
1983   destroy the killtcp structure
1984  */
1985 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
1986 {
1987         killtcp->vnn->killtcp = NULL;
1988         return 0;
1989 }
1990
1991
/* Insert callback for the killtcp tree: unconditionally replace any
   existing connection structure with the new one.

   The old one, if any, is not freed here; per the original note it is
   talloc_stolen by the same node in the tree and goes away when the new
   data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
2003
2004 /*
2005   add a tcp socket to the list of connections we want to RST
2006  */
2007 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2008                                        ctdb_sock_addr *s,
2009                                        ctdb_sock_addr *d)
2010 {
2011         ctdb_sock_addr src, dst;
2012         struct ctdb_kill_tcp *killtcp;
2013         struct ctdb_killtcp_con *con;
2014         struct ctdb_vnn *vnn;
2015
2016         ctdb_canonicalize_ip(s, &src);
2017         ctdb_canonicalize_ip(d, &dst);
2018
2019         vnn = find_public_ip_vnn(ctdb, &dst);
2020         if (vnn == NULL) {
2021                 vnn = find_public_ip_vnn(ctdb, &src);
2022         }
2023         if (vnn == NULL) {
2024                 /* if it is not a public ip   it could be our 'single ip' */
2025                 if (ctdb->single_ip_vnn) {
2026                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2027                                 vnn = ctdb->single_ip_vnn;
2028                         }
2029                 }
2030         }
2031         if (vnn == NULL) {
2032                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2033                 return -1;
2034         }
2035
2036         killtcp = vnn->killtcp;
2037         
2038         /* If this is the first connection to kill we must allocate
2039            a new structure
2040          */
2041         if (killtcp == NULL) {
2042                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2043                 CTDB_NO_MEMORY(ctdb, killtcp);
2044
2045                 killtcp->vnn         = vnn;
2046                 killtcp->ctdb        = ctdb;
2047                 killtcp->capture_fd  = -1;
2048                 killtcp->connections = trbt_create(killtcp, 0);
2049
2050                 vnn->killtcp         = killtcp;
2051                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2052         }
2053
2054
2055
2056         /* create a structure that describes this connection we want to
2057            RST and store it in killtcp->connections
2058         */
2059         con = talloc(killtcp, struct ctdb_killtcp_con);
2060         CTDB_NO_MEMORY(ctdb, con);
2061         con->src_addr = src;
2062         con->dst_addr = dst;
2063         con->count    = 0;
2064         con->killtcp  = killtcp;
2065
2066
2067         trbt_insertarray32_callback(killtcp->connections,
2068                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2069                         add_killtcp_callback, con);
2070
2071         /* 
2072            If we dont have a socket to listen on yet we must create it
2073          */
2074         if (killtcp->capture_fd == -1) {
2075                 const char *iface = ctdb_vnn_iface_string(vnn);
2076                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2077                 if (killtcp->capture_fd == -1) {
2078                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2079                                           "socket on iface '%s' for killtcp (%s)\n",
2080                                           iface, strerror(errno)));
2081                         goto failed;
2082                 }
2083         }
2084
2085
2086         if (killtcp->fde == NULL) {
2087                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
2088                                             EVENT_FD_READ | EVENT_FD_AUTOCLOSE, 
2089                                             capture_tcp_handler, killtcp);
2090
2091                 /* We also need to set up some events to tickle all these connections
2092                    until they are all reset
2093                 */
2094                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2095                                 ctdb_tickle_sentenced_connections, killtcp);
2096         }
2097
2098         /* tickle him once now */
2099         ctdb_sys_send_tcp(
2100                 &con->dst_addr,
2101                 &con->src_addr,
2102                 0, 0, 0);
2103
2104         return 0;
2105
2106 failed:
2107         talloc_free(vnn->killtcp);
2108         vnn->killtcp = NULL;
2109         return -1;
2110 }
2111
2112 /*
2113   kill a TCP connection.
2114  */
2115 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2116 {
2117         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2118
2119         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2120 }
2121
2122 /*
2123   called by a daemon to inform us of the entire list of TCP tickles for
2124   a particular public address.
2125   this control should only be sent by the node that is currently serving
2126   that public address.
2127  */
2128 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2129 {
2130         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2131         struct ctdb_tcp_array *tcparray;
2132         struct ctdb_vnn *vnn;
2133
2134         /* We must at least have tickles.num or else we cant verify the size
2135            of the received data blob
2136          */
2137         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2138                                         tickles.connections)) {
2139                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2140                 return -1;
2141         }
2142
2143         /* verify that the size of data matches what we expect */
2144         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2145                                 tickles.connections)
2146                          + sizeof(struct ctdb_tcp_connection)
2147                                  * list->tickles.num) {
2148                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
2149                 return -1;
2150         }       
2151
2152         vnn = find_public_ip_vnn(ctdb, &list->addr);
2153         if (vnn == NULL) {
2154                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
2155                         ctdb_addr_to_str(&list->addr)));
2156
2157                 return 1;
2158         }
2159
2160         /* remove any old ticklelist we might have */
2161         talloc_free(vnn->tcp_array);
2162         vnn->tcp_array = NULL;
2163
2164         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
2165         CTDB_NO_MEMORY(ctdb, tcparray);
2166
2167         tcparray->num = list->tickles.num;
2168
2169         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
2170         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2171
2172         memcpy(tcparray->connections, &list->tickles.connections[0], 
2173                sizeof(struct ctdb_tcp_connection)*tcparray->num);
2174
2175         /* We now have a new fresh tickle list array for this vnn */
2176         vnn->tcp_array = talloc_steal(vnn, tcparray);
2177         
2178         return 0;
2179 }
2180
2181 /*
2182   called to return the full list of tickles for the puclic address associated 
2183   with the provided vnn
2184  */
2185 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2186 {
2187         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2188         struct ctdb_control_tcp_tickle_list *list;
2189         struct ctdb_tcp_array *tcparray;
2190         int num;
2191         struct ctdb_vnn *vnn;
2192
2193         vnn = find_public_ip_vnn(ctdb, addr);
2194         if (vnn == NULL) {
2195                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
2196                         ctdb_addr_to_str(addr)));
2197
2198                 return 1;
2199         }
2200
2201         tcparray = vnn->tcp_array;
2202         if (tcparray) {
2203                 num = tcparray->num;
2204         } else {
2205                 num = 0;
2206         }
2207
2208         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2209                                 tickles.connections)
2210                         + sizeof(struct ctdb_tcp_connection) * num;
2211
2212         outdata->dptr  = talloc_size(outdata, outdata->dsize);
2213         CTDB_NO_MEMORY(ctdb, outdata->dptr);
2214         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
2215
2216         list->addr = *addr;
2217         list->tickles.num = num;
2218         if (num) {
2219                 memcpy(&list->tickles.connections[0], tcparray->connections, 
2220                         sizeof(struct ctdb_tcp_connection) * num);
2221         }
2222
2223         return 0;
2224 }
2225
2226
2227 /*
2228   set the list of all tcp tickles for a public address
2229  */
2230 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
2231                               struct timeval timeout, uint32_t destnode, 
2232                               ctdb_sock_addr *addr,
2233                               struct ctdb_tcp_array *tcparray)
2234 {
2235         int ret, num;
2236         TDB_DATA data;
2237         struct ctdb_control_tcp_tickle_list *list;
2238
2239         if (tcparray) {
2240                 num = tcparray->num;
2241         } else {
2242                 num = 0;
2243         }
2244
2245         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2246                                 tickles.connections) +
2247                         sizeof(struct ctdb_tcp_connection) * num;
2248         data.dptr = talloc_size(ctdb, data.dsize);
2249         CTDB_NO_MEMORY(ctdb, data.dptr);
2250
2251         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
2252         list->addr = *addr;
2253         list->tickles.num = num;
2254         if (tcparray) {
2255                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
2256         }
2257
2258         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2259                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2260                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2261         if (ret != 0) {
2262                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2263                 return -1;
2264         }
2265
2266         talloc_free(data.dptr);
2267
2268         return ret;
2269 }
2270
2271
2272 /*
2273   perform tickle updates if required
2274  */
2275 static void ctdb_update_tcp_tickles(struct event_context *ev, 
2276                                 struct timed_event *te, 
2277                                 struct timeval t, void *private_data)
2278 {
2279         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2280         int ret;
2281         struct ctdb_vnn *vnn;
2282
2283         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2284                 /* we only send out updates for public addresses that 
2285                    we have taken over
2286                  */
2287                 if (ctdb->pnn != vnn->pnn) {
2288                         continue;
2289                 }
2290                 /* We only send out the updates if we need to */
2291                 if (!vnn->tcp_update_needed) {
2292                         continue;
2293                 }
2294                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
2295                                 TAKEOVER_TIMEOUT(),
2296                                 CTDB_BROADCAST_CONNECTED,
2297                                 &vnn->public_address,
2298                                 vnn->tcp_array);
2299                 if (ret != 0) {
2300                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2301                                 ctdb_addr_to_str(&vnn->public_address)));
2302                 }
2303         }
2304
2305         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2306                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2307                              ctdb_update_tcp_tickles, ctdb);
2308 }               
2309         
2310
2311 /*
2312   start periodic update of tcp tickles
2313  */
2314 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2315 {
2316         ctdb->tickle_update_context = talloc_new(ctdb);
2317
2318         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2319                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2320                              ctdb_update_tcp_tickles, ctdb);
2321 }
2322
2323
2324
2325
2326 struct control_gratious_arp {
2327         struct ctdb_context *ctdb;
2328         ctdb_sock_addr addr;
2329         const char *iface;
2330         int count;
2331 };
2332
2333 /*
2334   send a control_gratuitous arp
2335  */
2336 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
2337                                   struct timeval t, void *private_data)
2338 {
2339         int ret;
2340         struct control_gratious_arp *arp = talloc_get_type(private_data, 
2341                                                         struct control_gratious_arp);
2342
2343         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2344         if (ret != 0) {
2345                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2346                                  arp->iface, strerror(errno)));
2347         }
2348
2349
2350         arp->count++;
2351         if (arp->count == CTDB_ARP_REPEAT) {
2352                 talloc_free(arp);
2353                 return;
2354         }
2355
2356         event_add_timed(arp->ctdb->ev, arp, 
2357                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
2358                         send_gratious_arp, arp);
2359 }
2360
2361
2362 /*
2363   send a gratious arp 
2364  */
2365 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2366 {
2367         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2368         struct control_gratious_arp *arp;
2369
2370         /* verify the size of indata */
2371         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2372                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
2373                                  (unsigned)indata.dsize, 
2374                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2375                 return -1;
2376         }
2377         if (indata.dsize != 
2378                 ( offsetof(struct ctdb_control_gratious_arp, iface)
2379                 + gratious_arp->len ) ){
2380
2381                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2382                         "but should be %u bytes\n", 
2383                          (unsigned)indata.dsize, 
2384                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2385                 return -1;
2386         }
2387
2388
2389         arp = talloc(ctdb, struct control_gratious_arp);
2390         CTDB_NO_MEMORY(ctdb, arp);
2391
2392         arp->ctdb  = ctdb;
2393         arp->addr   = gratious_arp->addr;
2394         arp->iface = talloc_strdup(arp, gratious_arp->iface);
2395         CTDB_NO_MEMORY(ctdb, arp->iface);
2396         arp->count = 0;
2397         
2398         event_add_timed(arp->ctdb->ev, arp, 
2399                         timeval_zero(), send_gratious_arp, arp);
2400
2401         return 0;
2402 }
2403
2404 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2405 {
2406         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2407         int ret;
2408
2409         /* verify the size of indata */
2410         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2411                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2412                 return -1;
2413         }
2414         if (indata.dsize != 
2415                 ( offsetof(struct ctdb_control_ip_iface, iface)
2416                 + pub->len ) ){
2417
2418                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2419                         "but should be %u bytes\n", 
2420                          (unsigned)indata.dsize, 
2421                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2422                 return -1;
2423         }
2424
2425         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2426
2427         if (ret != 0) {
2428                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2429                 return -1;
2430         }
2431
2432         return 0;
2433 }
2434
/*
  Completion callback of the releaseip event started by
  del_public_address.  Whatever the status, the context passed as
  private_data is freed.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
                                void *private_data)
{
        void *mem_ctx = private_data;

        talloc_free(mem_ctx);
}
2443
2444 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2445 {
2446         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2447         struct ctdb_vnn *vnn;
2448         int ret;
2449
2450         /* verify the size of indata */
2451         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2452                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2453                 return -1;
2454         }
2455         if (indata.dsize != 
2456                 ( offsetof(struct ctdb_control_ip_iface, iface)
2457                 + pub->len ) ){
2458
2459                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2460                         "but should be %u bytes\n", 
2461                          (unsigned)indata.dsize, 
2462                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2463                 return -1;
2464         }
2465
2466         /* walk over all public addresses until we find a match */
2467         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2468                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2469                         TALLOC_CTX *mem_ctx;
2470
2471                         DLIST_REMOVE(ctdb->vnn, vnn);
2472                         if (vnn->iface == NULL) {
2473                                 talloc_free(vnn);
2474                                 return 0;
2475                         }
2476
2477                         mem_ctx = talloc_new(ctdb);
2478                         ret = ctdb_event_script_callback(ctdb, 
2479                                          mem_ctx, delete_ip_callback, mem_ctx,
2480                                          false,
2481                                          CTDB_EVENT_RELEASE_IP,
2482                                          "%s %s %u",
2483                                          ctdb_vnn_iface_string(vnn),
2484                                          ctdb_addr_to_str(&vnn->public_address),
2485                                          vnn->public_netmask_bits);
2486                         ctdb_vnn_unassign_iface(ctdb, vnn);
2487                         talloc_free(vnn);
2488                         if (ret != 0) {
2489                                 return -1;
2490                         }
2491                         return 0;
2492                 }
2493         }
2494
2495         return -1;
2496 }
2497