New version 1.2.15
[sahlberg/ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "lib/tevent/tevent.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/* timeout value built from the takeover_timeout tunable via
   timeval_current_ofs(); relies on a variable named 'ctdb' being in scope
   wherever the macro is expanded */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* first argument handed to timeval_current_ofs() when the next arp round is
   scheduled (presumably whole seconds - confirm against timeval_current_ofs) */
#define CTDB_ARP_INTERVAL 1
/* the arp state is freed once its round counter reaches this value */
#define CTDB_ARP_REPEAT   3
35
/*
  one network interface known to this node; entries are kept on the
  ctdb->ifaces list
 */
struct ctdb_iface {
        /* list linkage for ctdb->ifaces */
        struct ctdb_iface *prev, *next;
        /* interface name; used as the lookup key (compared with strcmp) */
        const char *name;
        /* consulted when choosing an interface: only entries with
           link_up set are considered usable */
        bool link_up;
        /* number of public addresses currently assigned to this interface;
           bumped on assign, dropped on unassign */
        uint32_t references;
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         i->link_up = false;
69
70         DLIST_ADD(ctdb->ifaces, i);
71
72         return 0;
73 }
74
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
76                                           const char *iface)
77 {
78         struct ctdb_iface *i;
79
80         /* Verify that we dont have an entry for this ip yet */
81         for (i=ctdb->ifaces;i;i=i->next) {
82                 if (strcmp(i->name, iface) == 0) {
83                         return i;
84                 }
85         }
86
87         return NULL;
88 }
89
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
91                                               struct ctdb_vnn *vnn)
92 {
93         int i;
94         struct ctdb_iface *cur = NULL;
95         struct ctdb_iface *best = NULL;
96
97         for (i=0; vnn->ifaces[i]; i++) {
98
99                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
100                 if (cur == NULL) {
101                         continue;
102                 }
103
104                 if (!cur->link_up) {
105                         continue;
106                 }
107
108                 if (best == NULL) {
109                         best = cur;
110                         continue;
111                 }
112
113                 if (cur->references < best->references) {
114                         best = cur;
115                         continue;
116                 }
117         }
118
119         return best;
120 }
121
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123                                      struct ctdb_vnn *vnn)
124 {
125         struct ctdb_iface *best = NULL;
126
127         if (vnn->iface) {
128                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129                                    "still assigned to iface '%s'\n",
130                                    ctdb_addr_to_str(&vnn->public_address),
131                                    ctdb_vnn_iface_string(vnn)));
132                 return 0;
133         }
134
135         best = ctdb_vnn_best_iface(ctdb, vnn);
136         if (best == NULL) {
137                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138                                   "cannot assign to iface any iface\n",
139                                   ctdb_addr_to_str(&vnn->public_address)));
140                 return -1;
141         }
142
143         vnn->iface = best;
144         best->references++;
145         vnn->pnn = ctdb->pnn;
146
147         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148                            "now assigned to iface '%s' refs[%d]\n",
149                            ctdb_addr_to_str(&vnn->public_address),
150                            ctdb_vnn_iface_string(vnn),
151                            best->references));
152         return 0;
153 }
154
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156                                     struct ctdb_vnn *vnn)
157 {
158         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159                            "now unassigned (old iface '%s' refs[%d])\n",
160                            ctdb_addr_to_str(&vnn->public_address),
161                            ctdb_vnn_iface_string(vnn),
162                            vnn->iface?vnn->iface->references:0));
163         if (vnn->iface) {
164                 vnn->iface->references--;
165         }
166         vnn->iface = NULL;
167         if (vnn->pnn == ctdb->pnn) {
168                 vnn->pnn = -1;
169         }
170 }
171
172 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
173                                struct ctdb_vnn *vnn)
174 {
175         int i;
176
177         if (vnn->iface && vnn->iface->link_up) {
178                 return true;
179         }
180
181         for (i=0; vnn->ifaces[i]; i++) {
182                 struct ctdb_iface *cur;
183
184                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
185                 if (cur == NULL) {
186                         continue;
187                 }
188
189                 if (cur->link_up) {
190                         return true;
191                 }
192         }
193
194         return false;
195 }
196
/*
  state for the repeated gratuitous arp / tcp tickle announcement of one
  public address
 */
struct ctdb_takeover_arp {
        struct ctdb_context *ctdb;
        /* number of announcement rounds sent so far */
        uint32_t count;
        /* the address being announced */
        ctdb_sock_addr addr;
        /* tcp connections to send tickle acks for; may be NULL */
        struct ctdb_tcp_array *tcparray;
        /* the public address entry; supplies the interface name and the
           talloc context the timed events hang off */
        struct ctdb_vnn *vnn;
};
204
205
/*
  lists of tcp endpoints
 */
struct ctdb_tcp_list {
        /* doubly linked list linkage */
        struct ctdb_tcp_list *prev, *next;
        /* the tcp connection held by this list element */
        struct ctdb_tcp_connection connection;
};
213
/*
  list of clients to kill on IP release
 */
struct ctdb_client_ip {
        /* doubly linked list linkage (list head is ctdb->client_ip_list) */
        struct ctdb_client_ip *prev, *next;
        struct ctdb_context *ctdb;
        /* public address the client registered with */
        ctdb_sock_addr addr;
        /* request id used to look up the struct ctdb_client */
        uint32_t client_id;
};
223
224
225 /*
226   send a gratuitous arp
227  */
228 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
229                                   struct timeval t, void *private_data)
230 {
231         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
232                                                         struct ctdb_takeover_arp);
233         int i, ret;
234         struct ctdb_tcp_array *tcparray;
235         const char *iface = ctdb_vnn_iface_string(arp->vnn);
236
237         ret = ctdb_sys_send_arp(&arp->addr, iface);
238         if (ret != 0) {
239                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
240                                   iface, strerror(errno)));
241         }
242
243         tcparray = arp->tcparray;
244         if (tcparray) {
245                 for (i=0;i<tcparray->num;i++) {
246                         struct ctdb_tcp_connection *tcon;
247
248                         tcon = &tcparray->connections[i];
249                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
250                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
251                                 ctdb_addr_to_str(&tcon->src_addr),
252                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
253                         ret = ctdb_sys_send_tcp(
254                                 &tcon->src_addr, 
255                                 &tcon->dst_addr,
256                                 0, 0, 0);
257                         if (ret != 0) {
258                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
259                                         ctdb_addr_to_str(&tcon->src_addr)));
260                         }
261                 }
262         }
263
264         arp->count++;
265
266         if (arp->count == CTDB_ARP_REPEAT) {
267                 talloc_free(arp);
268                 return;
269         }
270
271         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
272                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
273                         ctdb_control_send_arp, arp);
274 }
275
276 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
277                                        struct ctdb_vnn *vnn)
278 {
279         struct ctdb_takeover_arp *arp;
280         struct ctdb_tcp_array *tcparray;
281
282         if (!vnn->takeover_ctx) {
283                 vnn->takeover_ctx = talloc_new(vnn);
284                 if (!vnn->takeover_ctx) {
285                         return -1;
286                 }
287         }
288
289         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
290         if (!arp) {
291                 return -1;
292         }
293
294         arp->ctdb = ctdb;
295         arp->addr = vnn->public_address;
296         arp->vnn  = vnn;
297
298         tcparray = vnn->tcp_array;
299         if (tcparray) {
300                 /* add all of the known tcp connections for this IP to the
301                    list of tcp connections to send tickle acks for */
302                 arp->tcparray = talloc_steal(arp, tcparray);
303
304                 vnn->tcp_array = NULL;
305                 vnn->tcp_update_needed = true;
306         }
307
308         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
309                         timeval_zero(), ctdb_control_send_arp, arp);
310
311         return 0;
312 }
313
/*
  state carried from ctdb_control_release_ip to release_ip_callback
 */
struct takeover_callback_state {
        /* the control request to answer once the event script is done */
        struct ctdb_req_control *c;
        /* private copy of the address being released */
        ctdb_sock_addr *addr;
        /* the public address entry being released */
        struct ctdb_vnn *vnn;
};
319
/*
  state carried from ctdb_do_takeip to ctdb_do_takeip_callback
 */
struct ctdb_do_takeip_state {
        /* the control request to answer once the event script is done */
        struct ctdb_req_control *c;
        /* the public address entry being taken over */
        struct ctdb_vnn *vnn;
};
324
325 /*
326   called when takeip event finishes
327  */
328 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
329                                     void *private_data)
330 {
331         struct ctdb_do_takeip_state *state =
332                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
333         int32_t ret;
334         TDB_DATA data;
335
336         if (status != 0) {
337                 if (status == -ETIME) {
338                         ctdb_ban_self(ctdb);
339                 }
340                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
341                                  ctdb_addr_to_str(&state->vnn->public_address),
342                                  ctdb_vnn_iface_string(state->vnn)));
343                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
344                 talloc_free(state);
345                 return;
346         }
347
348         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
349         if (ret != 0) {
350                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
351                 talloc_free(state);
352                 return;
353         }
354
355         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
356         data.dsize = strlen((char *)data.dptr) + 1;
357         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
358
359         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
360
361
362         /* the control succeeded */
363         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
364         talloc_free(state);
365         return;
366 }
367
368 /*
369   take over an ip address
370  */
371 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
372                               struct ctdb_req_control *c,
373                               struct ctdb_vnn *vnn)
374 {
375         int ret;
376         struct ctdb_do_takeip_state *state;
377
378         ret = ctdb_vnn_assign_iface(ctdb, vnn);
379         if (ret != 0) {
380                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
381                                  "assin a usable interface\n",
382                                  ctdb_addr_to_str(&vnn->public_address),
383                                  vnn->public_netmask_bits));
384                 return -1;
385         }
386
387         state = talloc(vnn, struct ctdb_do_takeip_state);
388         CTDB_NO_MEMORY(ctdb, state);
389
390         state->c = talloc_steal(ctdb, c);
391         state->vnn   = vnn;
392
393         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
394                             ctdb_addr_to_str(&vnn->public_address),
395                             vnn->public_netmask_bits,
396                             ctdb_vnn_iface_string(vnn)));
397
398         ret = ctdb_event_script_callback(ctdb,
399                                          state,
400                                          ctdb_do_takeip_callback,
401                                          state,
402                                          false,
403                                          CTDB_EVENT_TAKE_IP,
404                                          "%s %s %u",
405                                          ctdb_vnn_iface_string(vnn),
406                                          ctdb_addr_to_str(&vnn->public_address),
407                                          vnn->public_netmask_bits);
408
409         if (ret != 0) {
410                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
411                         ctdb_addr_to_str(&vnn->public_address),
412                         ctdb_vnn_iface_string(vnn)));
413                 talloc_free(state);
414                 return -1;
415         }
416
417         return 0;
418 }
419
/*
  state carried from ctdb_do_updateip to ctdb_do_updateip_callback
 */
struct ctdb_do_updateip_state {
        /* the control request to answer once the event script is done */
        struct ctdb_req_control *c;
        /* interface the address was on before the move; restored when the
           update event fails */
        struct ctdb_iface *old;
        /* the public address entry being moved */
        struct ctdb_vnn *vnn;
};
425
426 /*
427   called when updateip event finishes
428  */
429 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
430                                       void *private_data)
431 {
432         struct ctdb_do_updateip_state *state =
433                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
434         int32_t ret;
435
436         if (status != 0) {
437                 if (status == -ETIME) {
438                         ctdb_ban_self(ctdb);
439                 }
440                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
441                         ctdb_addr_to_str(&state->vnn->public_address),
442                         state->old->name,
443                         ctdb_vnn_iface_string(state->vnn)));
444
445                 /*
446                  * All we can do is reset the old interface
447                  * and let the next run fix it
448                  */
449                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
450                 state->vnn->iface = state->old;
451                 state->vnn->iface->references++;
452
453                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
454                 talloc_free(state);
455                 return;
456         }
457
458         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
459         if (ret != 0) {
460                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
461                 talloc_free(state);
462                 return;
463         }
464
465         /* the control succeeded */
466         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
467         talloc_free(state);
468         return;
469 }
470
471 /*
472   update (move) an ip address
473  */
474 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
475                                 struct ctdb_req_control *c,
476                                 struct ctdb_vnn *vnn)
477 {
478         int ret;
479         struct ctdb_do_updateip_state *state;
480         struct ctdb_iface *old = vnn->iface;
481
482         ctdb_vnn_unassign_iface(ctdb, vnn);
483         ret = ctdb_vnn_assign_iface(ctdb, vnn);
484         if (ret != 0) {
485                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
486                                  "assin a usable interface (old iface '%s')\n",
487                                  ctdb_addr_to_str(&vnn->public_address),
488                                  vnn->public_netmask_bits,
489                                  old->name));
490                 return -1;
491         }
492
493         state = talloc(vnn, struct ctdb_do_updateip_state);
494         CTDB_NO_MEMORY(ctdb, state);
495
496         state->c = talloc_steal(ctdb, c);
497         state->old = old;
498         state->vnn = vnn;
499
500         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
501                             "interface %s to %s\n",
502                             ctdb_addr_to_str(&vnn->public_address),
503                             vnn->public_netmask_bits,
504                             old->name,
505                             ctdb_vnn_iface_string(vnn)));
506
507         ret = ctdb_event_script_callback(ctdb,
508                                          state,
509                                          ctdb_do_updateip_callback,
510                                          state,
511                                          false,
512                                          CTDB_EVENT_UPDATE_IP,
513                                          "%s %s %s %u",
514                                          state->old->name,
515                                          ctdb_vnn_iface_string(vnn),
516                                          ctdb_addr_to_str(&vnn->public_address),
517                                          vnn->public_netmask_bits);
518         if (ret != 0) {
519                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
520                                  ctdb_addr_to_str(&vnn->public_address),
521                                  old->name, ctdb_vnn_iface_string(vnn)));
522                 talloc_free(state);
523                 return -1;
524         }
525
526         return 0;
527 }
528
529 /*
530   Find the vnn of the node that has a public ip address
531   returns -1 if the address is not known as a public address
532  */
533 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
534 {
535         struct ctdb_vnn *vnn;
536
537         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
538                 if (ctdb_same_ip(&vnn->public_address, addr)) {
539                         return vnn;
540                 }
541         }
542
543         return NULL;
544 }
545
546 /*
547   take over an ip address
548  */
549 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
550                                  struct ctdb_req_control *c,
551                                  TDB_DATA indata,
552                                  bool *async_reply)
553 {
554         int ret;
555         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
556         struct ctdb_vnn *vnn;
557         bool have_ip = false;
558         bool do_updateip = false;
559         bool do_takeip = false;
560         struct ctdb_iface *best_iface = NULL;
561
562         if (pip->pnn != ctdb->pnn) {
563                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
564                                  "with pnn %d, but we're node %d\n",
565                                  ctdb_addr_to_str(&pip->addr),
566                                  pip->pnn, ctdb->pnn));
567                 return -1;
568         }
569
570         /* update out vnn list */
571         vnn = find_public_ip_vnn(ctdb, &pip->addr);
572         if (vnn == NULL) {
573                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
574                         ctdb_addr_to_str(&pip->addr)));
575                 return 0;
576         }
577
578         have_ip = ctdb_sys_have_ip(&pip->addr);
579         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
580         if (best_iface == NULL) {
581                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
582                                  "a usable interface (old %s, have_ip %d)\n",
583                                  ctdb_addr_to_str(&vnn->public_address),
584                                  vnn->public_netmask_bits,
585                                  ctdb_vnn_iface_string(vnn),
586                                  have_ip));
587                 return -1;
588         }
589
590         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
591                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
592                 have_ip = false;
593         }
594
595         if (vnn->iface == NULL && have_ip) {
596                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
597                                   "but we have no interface assigned, has someone manually configured it?"
598                                   "banning ourself\n",
599                                  ctdb_addr_to_str(&vnn->public_address)));
600                 ctdb_ban_self(ctdb);
601                 return -1;
602         }
603
604         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
605                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
606                                   "and we have it on iface[%s], but it was assigned to node %d"
607                                   "and we are node %d, banning ourself\n",
608                                  ctdb_addr_to_str(&vnn->public_address),
609                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
610                 ctdb_ban_self(ctdb);
611                 return -1;
612         }
613
614         if (vnn->pnn == -1 && have_ip) {
615                 vnn->pnn = ctdb->pnn;
616                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
617                                   "and we already have it on iface[%s], update local daemon\n",
618                                  ctdb_addr_to_str(&vnn->public_address),
619                                   ctdb_vnn_iface_string(vnn)));
620                 return 0;
621         }
622
623         if (vnn->iface) {
624                 if (vnn->iface->link_up) {
625                         /* only move when the rebalance gains something */
626                         if (vnn->iface->references > (best_iface->references + 1)) {
627                                 do_updateip = true;
628                         }
629                 } else if (vnn->iface != best_iface) {
630                         do_updateip = true;
631                 }
632         }
633
634         if (!have_ip) {
635                 if (do_updateip) {
636                         ctdb_vnn_unassign_iface(ctdb, vnn);
637                         do_updateip = false;
638                 }
639                 do_takeip = true;
640         }
641
642         if (do_takeip) {
643                 ret = ctdb_do_takeip(ctdb, c, vnn);
644                 if (ret != 0) {
645                         return -1;
646                 }
647         } else if (do_updateip) {
648                 ret = ctdb_do_updateip(ctdb, c, vnn);
649                 if (ret != 0) {
650                         return -1;
651                 }
652         } else {
653                 /*
654                  * The interface is up and the kernel known the ip
655                  * => do nothing
656                  */
657                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
658                         ctdb_addr_to_str(&pip->addr),
659                         vnn->public_netmask_bits,
660                         ctdb_vnn_iface_string(vnn)));
661                 return 0;
662         }
663
664         /* tell ctdb_control.c that we will be replying asynchronously */
665         *async_reply = true;
666
667         return 0;
668 }
669
670 /*
671   takeover an ip address old v4 style
672  */
673 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
674                                 struct ctdb_req_control *c,
675                                 TDB_DATA indata, 
676                                 bool *async_reply)
677 {
678         TDB_DATA data;
679         
680         data.dsize = sizeof(struct ctdb_public_ip);
681         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
682         CTDB_NO_MEMORY(ctdb, data.dptr);
683         
684         memcpy(data.dptr, indata.dptr, indata.dsize);
685         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
686 }
687
688 /*
689   kill any clients that are registered with a IP that is being released
690  */
691 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
692 {
693         struct ctdb_client_ip *ip;
694
695         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
696                 ctdb_addr_to_str(addr)));
697
698         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
699                 ctdb_sock_addr tmp_addr;
700
701                 tmp_addr = ip->addr;
702                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
703                         ip->client_id,
704                         ctdb_addr_to_str(&ip->addr)));
705
706                 if (ctdb_same_ip(&tmp_addr, addr)) {
707                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
708                                                                      ip->client_id, 
709                                                                      struct ctdb_client);
710                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
711                                 ip->client_id,
712                                 ctdb_addr_to_str(&ip->addr),
713                                 client->pid));
714
715                         if (client->pid != 0) {
716                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
717                                         (unsigned)client->pid,
718                                         ctdb_addr_to_str(addr),
719                                         ip->client_id));
720                                 kill(client->pid, SIGKILL);
721                         }
722                 }
723         }
724 }
725
726 /*
727   called when releaseip event finishes
728  */
729 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
730                                 void *private_data)
731 {
732         struct takeover_callback_state *state = 
733                 talloc_get_type(private_data, struct takeover_callback_state);
734         TDB_DATA data;
735
736         if (status == -ETIME) {
737                 ctdb_ban_self(ctdb);
738         }
739
740         /* send a message to all clients of this node telling them
741            that the cluster has been reconfigured and they should
742            release any sockets on this IP */
743         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
744         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
745         data.dsize = strlen((char *)data.dptr)+1;
746
747         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
748
749         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
750
751         /* kill clients that have registered with this IP */
752         release_kill_clients(ctdb, state->addr);
753
754         ctdb_vnn_unassign_iface(ctdb, state->vnn);
755
756         /* the control succeeded */
757         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
758         talloc_free(state);
759 }
760
761 /*
762   release an ip address
763  */
764 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
765                                 struct ctdb_req_control *c,
766                                 TDB_DATA indata, 
767                                 bool *async_reply)
768 {
769         int ret;
770         struct takeover_callback_state *state;
771         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
772         struct ctdb_vnn *vnn;
773
774         /* update our vnn list */
775         vnn = find_public_ip_vnn(ctdb, &pip->addr);
776         if (vnn == NULL) {
777                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
778                         ctdb_addr_to_str(&pip->addr)));
779                 return 0;
780         }
781         vnn->pnn = pip->pnn;
782
783         /* stop any previous arps */
784         talloc_free(vnn->takeover_ctx);
785         vnn->takeover_ctx = NULL;
786
787         if (!ctdb_sys_have_ip(&pip->addr)) {
788                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
789                         ctdb_addr_to_str(&pip->addr),
790                         vnn->public_netmask_bits, 
791                         ctdb_vnn_iface_string(vnn)));
792                 ctdb_vnn_unassign_iface(ctdb, vnn);
793                 return 0;
794         }
795
796         if (vnn->iface == NULL) {
797                 DEBUG(DEBUG_CRIT,(__location__ " release_ip of IP %s is known to the kernel, "
798                                   "but we have no interface assigned, has someone manually configured it?"
799                                   "banning ourself\n",
800                                  ctdb_addr_to_str(&vnn->public_address)));
801                 ctdb_ban_self(ctdb);
802                 return -1;
803         }
804
805         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
806                 ctdb_addr_to_str(&pip->addr),
807                 vnn->public_netmask_bits, 
808                 ctdb_vnn_iface_string(vnn),
809                 pip->pnn));
810
811         state = talloc(ctdb, struct takeover_callback_state);
812         CTDB_NO_MEMORY(ctdb, state);
813
814         state->c = talloc_steal(state, c);
815         state->addr = talloc(state, ctdb_sock_addr);       
816         CTDB_NO_MEMORY(ctdb, state->addr);
817         *state->addr = pip->addr;
818         state->vnn   = vnn;
819
820         ret = ctdb_event_script_callback(ctdb, 
821                                          state, release_ip_callback, state,
822                                          false,
823                                          CTDB_EVENT_RELEASE_IP,
824                                          "%s %s %u",
825                                          ctdb_vnn_iface_string(vnn),
826                                          ctdb_addr_to_str(&pip->addr),
827                                          vnn->public_netmask_bits);
828         if (ret != 0) {
829                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
830                         ctdb_addr_to_str(&pip->addr),
831                         ctdb_vnn_iface_string(vnn)));
832                 talloc_free(state);
833                 return -1;
834         }
835
836         /* tell the control that we will be reply asynchronously */
837         *async_reply = true;
838         return 0;
839 }
840
841 /*
842   release an ip address old v4 style
843  */
844 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
845                                 struct ctdb_req_control *c,
846                                 TDB_DATA indata, 
847                                 bool *async_reply)
848 {
849         TDB_DATA data;
850         
851         data.dsize = sizeof(struct ctdb_public_ip);
852         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
853         CTDB_NO_MEMORY(ctdb, data.dptr);
854         
855         memcpy(data.dptr, indata.dptr, indata.dsize);
856         return ctdb_control_release_ip(ctdb, c, data, async_reply);
857 }
858
859
860 static int ctdb_add_public_address(struct ctdb_context *ctdb,
861                                    ctdb_sock_addr *addr,
862                                    unsigned mask, const char *ifaces)
863 {
864         struct ctdb_vnn      *vnn;
865         uint32_t num = 0;
866         char *tmp;
867         const char *iface;
868         int i;
869         int ret;
870
871         /* Verify that we dont have an entry for this ip yet */
872         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
873                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
874                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
875                                 ctdb_addr_to_str(addr)));
876                         return -1;
877                 }               
878         }
879
880         /* create a new vnn structure for this ip address */
881         vnn = talloc_zero(ctdb, struct ctdb_vnn);
882         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
883         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
884         tmp = talloc_strdup(vnn, ifaces);
885         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
886         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
887                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
888                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
889                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
890                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
891                 num++;
892         }
893         talloc_free(tmp);
894         vnn->ifaces[num] = NULL;
895         vnn->public_address      = *addr;
896         vnn->public_netmask_bits = mask;
897         vnn->pnn                 = -1;
898         if (ctdb_sys_have_ip(addr)) {
899                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
900                 vnn->pnn = ctdb->pnn;
901         }
902
903         for (i=0; vnn->ifaces[i]; i++) {
904                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
905                 if (ret != 0) {
906                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
907                                            "for public_address[%s]\n",
908                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
909                         talloc_free(vnn);
910                         return -1;
911                 }
912                 if (i == 0) {
913                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
914                 }
915         }
916
917         DLIST_ADD(ctdb->vnn, vnn);
918
919         return 0;
920 }
921
922 /*
923   setup the event script directory
924 */
925 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
926 {
927         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
928         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
929         return 0;
930 }
931
932 /*
933   setup the public address lists from a file
934 */
935 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
936 {
937         char **lines;
938         int nlines;
939         int i;
940
941         lines = file_lines_load(alist, &nlines, ctdb);
942         if (lines == NULL) {
943                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
944                 return -1;
945         }
946         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
947                 nlines--;
948         }
949
950         for (i=0;i<nlines;i++) {
951                 unsigned mask;
952                 ctdb_sock_addr addr;
953                 const char *addrstr;
954                 const char *ifaces;
955                 char *tok, *line;
956
957                 line = lines[i];
958                 while ((*line == ' ') || (*line == '\t')) {
959                         line++;
960                 }
961                 if (*line == '#') {
962                         continue;
963                 }
964                 if (strcmp(line, "") == 0) {
965                         continue;
966                 }
967                 tok = strtok(line, " \t");
968                 addrstr = tok;
969                 tok = strtok(NULL, " \t");
970                 if (tok == NULL) {
971                         if (NULL == ctdb->default_public_interface) {
972                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
973                                          i+1));
974                                 talloc_free(lines);
975                                 return -1;
976                         }
977                         ifaces = ctdb->default_public_interface;
978                 } else {
979                         ifaces = tok;
980                 }
981
982                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
983                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
984                         talloc_free(lines);
985                         return -1;
986                 }
987                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
988                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
989                         talloc_free(lines);
990                         return -1;
991                 }
992         }
993
994         talloc_free(lines);
995         return 0;
996 }
997
998 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
999                               const char *iface,
1000                               const char *ip)
1001 {
1002         struct ctdb_vnn *svnn;
1003         struct ctdb_iface *cur = NULL;
1004         bool ok;
1005         int ret;
1006
1007         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1008         CTDB_NO_MEMORY(ctdb, svnn);
1009
1010         svnn->ifaces = talloc_array(svnn, const char *, 2);
1011         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1012         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1013         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1014         svnn->ifaces[1] = NULL;
1015
1016         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1017         if (!ok) {
1018                 talloc_free(svnn);
1019                 return -1;
1020         }
1021
1022         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1023         if (ret != 0) {
1024                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1025                                    "for single_ip[%s]\n",
1026                                    svnn->ifaces[0],
1027                                    ctdb_addr_to_str(&svnn->public_address)));
1028                 talloc_free(svnn);
1029                 return -1;
1030         }
1031
1032         /* assume the single public ip interface is initially "good" */
1033         cur = ctdb_find_iface(ctdb, iface);
1034         if (cur == NULL) {
1035                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1036                 return -1;
1037         }
1038         cur->link_up = true;
1039
1040         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1041         if (ret != 0) {
1042                 talloc_free(svnn);
1043                 return -1;
1044         }
1045
1046         ctdb->single_ip_vnn = svnn;
1047         return 0;
1048 }
1049
1050 struct ctdb_public_ip_list {
1051         struct ctdb_public_ip_list *next;
1052         uint32_t pnn;
1053         ctdb_sock_addr addr;
1054 };
1055
1056
1057 /* Given a physical node, return the number of
1058    public addresses that is currently assigned to this node.
1059 */
1060 static int node_ip_coverage(struct ctdb_context *ctdb, 
1061         int32_t pnn,
1062         struct ctdb_public_ip_list *ips)
1063 {
1064         int num=0;
1065
1066         for (;ips;ips=ips->next) {
1067                 if (ips->pnn == pnn) {
1068                         num++;
1069                 }
1070         }
1071         return num;
1072 }
1073
1074
1075 /* Check if this is a public ip known to the node, i.e. can that
1076    node takeover this ip ?
1077 */
1078 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1079                 struct ctdb_public_ip_list *ip)
1080 {
1081         struct ctdb_all_public_ips *public_ips;
1082         int i;
1083
1084         public_ips = ctdb->nodes[pnn]->available_public_ips;
1085
1086         if (public_ips == NULL) {
1087                 return -1;
1088         }
1089
1090         for (i=0;i<public_ips->num;i++) {
1091                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1092                         /* yes, this node can serve this public ip */
1093                         return 0;
1094                 }
1095         }
1096
1097         return -1;
1098 }
1099
1100
1101 /* search the node lists list for a node to takeover this ip.
1102    pick the node that currently are serving the least number of ips
1103    so that the ips get spread out evenly.
1104 */
1105 static int find_takeover_node(struct ctdb_context *ctdb, 
1106                 struct ctdb_node_map *nodemap, uint32_t mask, 
1107                 struct ctdb_public_ip_list *ip,
1108                 struct ctdb_public_ip_list *all_ips)
1109 {
1110         int pnn, min=0, num;
1111         int i;
1112
1113         pnn    = -1;
1114         for (i=0;i<nodemap->num;i++) {
1115                 if (nodemap->nodes[i].flags & mask) {
1116                         /* This node is not healty and can not be used to serve
1117                            a public address 
1118                         */
1119                         continue;
1120                 }
1121
1122                 /* verify that this node can serve this ip */
1123                 if (can_node_serve_ip(ctdb, i, ip)) {
1124                         /* no it couldnt   so skip to the next node */
1125                         continue;
1126                 }
1127
1128                 num = node_ip_coverage(ctdb, i, all_ips);
1129                 /* was this the first node we checked ? */
1130                 if (pnn == -1) {
1131                         pnn = i;
1132                         min  = num;
1133                 } else {
1134                         if (num < min) {
1135                                 pnn = i;
1136                                 min  = num;
1137                         }
1138                 }
1139         }       
1140         if (pnn == -1) {
1141                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1142                         ctdb_addr_to_str(&ip->addr)));
1143
1144                 return -1;
1145         }
1146
1147         ip->pnn = pnn;
1148         return 0;
1149 }
1150
1151 #define IP_KEYLEN       4
1152 static uint32_t *ip_key(ctdb_sock_addr *ip)
1153 {
1154         static uint32_t key[IP_KEYLEN];
1155
1156         bzero(key, sizeof(key));
1157
1158         switch (ip->sa.sa_family) {
1159         case AF_INET:
1160                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1161                 break;
1162         case AF_INET6:
1163                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
1164                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
1165                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
1166                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
1167                 break;
1168         default:
1169                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1170                 return key;
1171         }
1172
1173         return key;
1174 }
1175
1176 static void *add_ip_callback(void *parm, void *data)
1177 {
1178         struct ctdb_public_ip_list *this_ip = parm; 
1179         struct ctdb_public_ip_list *prev_ip = data; 
1180
1181         if (prev_ip == NULL) {
1182                 return parm;
1183         }
1184         if (this_ip->pnn == -1) {
1185                 this_ip->pnn = prev_ip->pnn;
1186         }
1187
1188         return parm;
1189 }
1190
1191 void getips_count_callback(void *param, void *data)
1192 {
1193         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1194         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1195
1196         new_ip->next = *ip_list;
1197         *ip_list     = new_ip;
1198 }
1199
1200 static struct ctdb_public_ip_list *
1201 create_merged_ip_list(struct ctdb_context *ctdb)
1202 {
1203         int i, j;
1204         struct ctdb_public_ip_list *ip_list;
1205         struct ctdb_all_public_ips *public_ips;
1206
1207         if (ctdb->ip_tree != NULL) {
1208                 talloc_free(ctdb->ip_tree);
1209                 ctdb->ip_tree = NULL;
1210         }
1211         ctdb->ip_tree = trbt_create(ctdb, 0);
1212
1213         for (i=0;i<ctdb->num_nodes;i++) {
1214                 public_ips = ctdb->nodes[i]->known_public_ips;
1215
1216                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1217                         continue;
1218                 }
1219
1220                 /* there were no public ips for this node */
1221                 if (public_ips == NULL) {
1222                         continue;
1223                 }               
1224
1225                 for (j=0;j<public_ips->num;j++) {
1226                         struct ctdb_public_ip_list *tmp_ip; 
1227
1228                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1229                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1230                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1231                         tmp_ip->addr = public_ips->ips[j].addr;
1232                         tmp_ip->next = NULL;
1233
1234                         trbt_insertarray32_callback(ctdb->ip_tree,
1235                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1236                                 add_ip_callback,
1237                                 tmp_ip);
1238                 }
1239         }
1240
1241         ip_list = NULL;
1242         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1243
1244         return ip_list;
1245 }
1246
1247 /*
1248   make any IP alias changes for public addresses that are necessary 
1249  */
1250 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1251 {
1252         int i, num_healthy, retries;
1253         struct ctdb_public_ip ip;
1254         struct ctdb_public_ipv4 ipv4;
1255         uint32_t mask, *nodes;
1256         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1257         int maxnode, maxnum=0, minnode, minnum=0, num;
1258         TDB_DATA data;
1259         struct timeval timeout;
1260         struct client_async_data *async_data;
1261         struct ctdb_client_control_state *state;
1262         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1263
1264         /*
1265          * ip failover is completely disabled, just send out the 
1266          * ipreallocated event.
1267          */
1268         if (ctdb->tunable.disable_ip_failover != 0) {
1269                 goto ipreallocated;
1270         }
1271
1272         ZERO_STRUCT(ip);
1273
1274         /* Count how many completely healthy nodes we have */
1275         num_healthy = 0;
1276         for (i=0;i<nodemap->num;i++) {
1277                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1278                         num_healthy++;
1279                 }
1280         }
1281
1282         if (num_healthy > 0) {
1283                 /* We have healthy nodes, so only consider them for 
1284                    serving public addresses
1285                 */
1286                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1287         } else {
1288                 /* We didnt have any completely healthy nodes so
1289                    use "disabled" nodes as a fallback
1290                 */
1291                 mask = NODE_FLAGS_INACTIVE;
1292         }
1293
1294         /* since nodes only know about those public addresses that
1295            can be served by that particular node, no single node has
1296            a full list of all public addresses that exist in the cluster.
1297            Walk over all node structures and create a merged list of
1298            all public addresses that exist in the cluster.
1299
1300            keep the tree of ips around as ctdb->ip_tree
1301         */
1302         all_ips = create_merged_ip_list(ctdb);
1303
1304         /* If we want deterministic ip allocations, i.e. that the ip addresses
1305            will always be allocated the same way for a specific set of
1306            available/unavailable nodes.
1307         */
1308         if (1 == ctdb->tunable.deterministic_public_ips) {              
1309                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1310                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1311                         tmp_ip->pnn = i%nodemap->num;
1312                 }
1313         }
1314
1315
1316         /* mark all public addresses with a masked node as being served by
1317            node -1
1318         */
1319         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1320                 if (tmp_ip->pnn == -1) {
1321                         continue;
1322                 }
1323                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1324                         tmp_ip->pnn = -1;
1325                 }
1326         }
1327
1328         /* verify that the assigned nodes can serve that public ip
1329            and set it to -1 if not
1330         */
1331         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1332                 if (tmp_ip->pnn == -1) {
1333                         continue;
1334                 }
1335                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1336                         /* this node can not serve this ip. */
1337                         tmp_ip->pnn = -1;
1338                 }
1339         }
1340
1341
1342         /* now we must redistribute all public addresses with takeover node
1343            -1 among the nodes available
1344         */
1345         retries = 0;
1346 try_again:
1347         /* loop over all ip's and find a physical node to cover for 
1348            each unassigned ip.
1349         */
1350         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1351                 if (tmp_ip->pnn == -1) {
1352                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1353                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1354                                         ctdb_addr_to_str(&tmp_ip->addr)));
1355                         }
1356                 }
1357         }
1358
1359         /* If we dont want ips to fail back after a node becomes healthy
1360            again, we wont even try to reallocat the ip addresses so that
1361            they are evenly spread out.
1362            This can NOT be used at the same time as DeterministicIPs !
1363         */
1364         if (1 == ctdb->tunable.no_ip_failback) {
1365                 if (1 == ctdb->tunable.deterministic_public_ips) {
1366                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1367                 }
1368                 goto finished;
1369         }
1370
1371
1372         /* now, try to make sure the ip adresses are evenly distributed
1373            across the node.
1374            for each ip address, loop over all nodes that can serve this
1375            ip and make sure that the difference between the node
1376            serving the most and the node serving the least ip's are not greater
1377            than 1.
1378         */
1379         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1380                 if (tmp_ip->pnn == -1) {
1381                         continue;
1382                 }
1383
1384                 /* Get the highest and lowest number of ips's served by any 
1385                    valid node which can serve this ip.
1386                 */
1387                 maxnode = -1;
1388                 minnode = -1;
1389                 for (i=0;i<nodemap->num;i++) {
1390                         if (nodemap->nodes[i].flags & mask) {
1391                                 continue;
1392                         }
1393
1394                         /* only check nodes that can actually serve this ip */
1395                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1396                                 /* no it couldnt   so skip to the next node */
1397                                 continue;
1398                         }
1399
1400                         num = node_ip_coverage(ctdb, i, all_ips);
1401                         if (maxnode == -1) {
1402                                 maxnode = i;
1403                                 maxnum  = num;
1404                         } else {
1405                                 if (num > maxnum) {
1406                                         maxnode = i;
1407                                         maxnum  = num;
1408                                 }
1409                         }
1410                         if (minnode == -1) {
1411                                 minnode = i;
1412                                 minnum  = num;
1413                         } else {
1414                                 if (num < minnum) {
1415                                         minnode = i;
1416                                         minnum  = num;
1417                                 }
1418                         }
1419                 }
1420                 if (maxnode == -1) {
1421                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1422                                 ctdb_addr_to_str(&tmp_ip->addr)));
1423
1424                         continue;
1425                 }
1426
1427                 /* If we want deterministic IPs then dont try to reallocate 
1428                    them to spread out the load.
1429                 */
1430                 if (1 == ctdb->tunable.deterministic_public_ips) {
1431                         continue;
1432                 }
1433
1434                 /* if the spread between the smallest and largest coverage by
1435                    a node is >=2 we steal one of the ips from the node with
1436                    most coverage to even things out a bit.
1437                    try to do this at most 5 times  since we dont want to spend
1438                    too much time balancing the ip coverage.
1439                 */
1440                 if ( (maxnum > minnum+1)
1441                   && (retries < 5) ){
1442                         struct ctdb_public_ip_list *tmp;
1443
1444                         /* mark one of maxnode's vnn's as unassigned and try
1445                            again
1446                         */
1447                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1448                                 if (tmp->pnn == maxnode) {
1449                                         tmp->pnn = -1;
1450                                         retries++;
1451                                         goto try_again;
1452                                 }
1453                         }
1454                 }
1455         }
1456
1457
1458         /* finished distributing the public addresses, now just send the 
1459            info out to the nodes
1460         */
1461 finished:
1462
1463         /* at this point ->pnn is the node which will own each IP
1464            or -1 if there is no node that can cover this ip
1465         */
1466
1467         /* now tell all nodes to delete any alias that they should not
1468            have.  This will be a NOOP on nodes that don't currently
1469            hold the given alias */
1470         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1471         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1472
1473         for (i=0;i<nodemap->num;i++) {
1474                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1475                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1476                         continue;
1477                 }
1478
1479                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1480                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1481                                 /* This node should be serving this
1482                                    vnn so dont tell it to release the ip
1483                                 */
1484                                 continue;
1485                         }
1486                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1487                                 ipv4.pnn = tmp_ip->pnn;
1488                                 ipv4.sin = tmp_ip->addr.ip;
1489
1490                                 timeout = TAKEOVER_TIMEOUT();
1491                                 data.dsize = sizeof(ipv4);
1492                                 data.dptr  = (uint8_t *)&ipv4;
1493                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1494                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1495                                                 data, async_data,
1496                                                 &timeout, NULL);
1497                         } else {
1498                                 ip.pnn  = tmp_ip->pnn;
1499                                 ip.addr = tmp_ip->addr;
1500
1501                                 timeout = TAKEOVER_TIMEOUT();
1502                                 data.dsize = sizeof(ip);
1503                                 data.dptr  = (uint8_t *)&ip;
1504                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1505                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
1506                                                 data, async_data,
1507                                                 &timeout, NULL);
1508                         }
1509
1510                         if (state == NULL) {
1511                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1512                                 talloc_free(tmp_ctx);
1513                                 return -1;
1514                         }
1515                 
1516                         ctdb_client_async_add(async_data, state);
1517                 }
1518         }
1519         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1520                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1521                 talloc_free(tmp_ctx);
1522                 return -1;
1523         }
1524         talloc_free(async_data);
1525
1526
1527         /* tell all nodes to get their own IPs */
1528         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1529         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1530         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1531                 if (tmp_ip->pnn == -1) {
1532                         /* this IP won't be taken over */
1533                         continue;
1534                 }
1535
1536                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1537                         ipv4.pnn = tmp_ip->pnn;
1538                         ipv4.sin = tmp_ip->addr.ip;
1539
1540                         timeout = TAKEOVER_TIMEOUT();
1541                         data.dsize = sizeof(ipv4);
1542                         data.dptr  = (uint8_t *)&ipv4;
1543                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1544                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1545                                         data, async_data,
1546                                         &timeout, NULL);
1547                 } else {
1548                         ip.pnn  = tmp_ip->pnn;
1549                         ip.addr = tmp_ip->addr;
1550
1551                         timeout = TAKEOVER_TIMEOUT();
1552                         data.dsize = sizeof(ip);
1553                         data.dptr  = (uint8_t *)&ip;
1554                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1555                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
1556                                         data, async_data,
1557                                         &timeout, NULL);
1558                 }
1559                 if (state == NULL) {
1560                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1561                         talloc_free(tmp_ctx);
1562                         return -1;
1563                 }
1564                 
1565                 ctdb_client_async_add(async_data, state);
1566         }
1567         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1568                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1569                 talloc_free(tmp_ctx);
1570                 return -1;
1571         }
1572
1573 ipreallocated:
1574         /* tell all nodes to update natwg */
1575         /* send the flags update natgw on all connected nodes */
1576         data.dptr  = discard_const("ipreallocated");
1577         data.dsize = strlen((char *)data.dptr) + 1; 
1578         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1579         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
1580                                       nodes, 0, TAKEOVER_TIMEOUT(),
1581                                       false, data,
1582                                       NULL, NULL,
1583                                       NULL) != 0) {
1584                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
1585         }
1586
1587         talloc_free(tmp_ctx);
1588         return 0;
1589 }
1590
1591
1592 /*
1593   destroy a ctdb_client_ip structure
1594  */
1595 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1596 {
1597         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1598                 ctdb_addr_to_str(&ip->addr),
1599                 ntohs(ip->addr.ip.sin_port),
1600                 ip->client_id));
1601
1602         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1603         return 0;
1604 }
1605
1606 /*
1607   called by a client to inform us of a TCP connection that it is managing
1608   that should tickled with an ACK when IP takeover is done
1609   we handle both the old ipv4 style of packets as well as the new ipv4/6
1610   pdus.
1611  */
1612 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1613                                 TDB_DATA indata)
1614 {
1615         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1616         struct ctdb_control_tcp *old_addr = NULL;
1617         struct ctdb_control_tcp_addr new_addr;
1618         struct ctdb_control_tcp_addr *tcp_sock = NULL;
1619         struct ctdb_tcp_list *tcp;
1620         struct ctdb_tcp_connection t;
1621         int ret;
1622         TDB_DATA data;
1623         struct ctdb_client_ip *ip;
1624         struct ctdb_vnn *vnn;
1625         ctdb_sock_addr addr;
1626
1627         switch (indata.dsize) {
1628         case sizeof(struct ctdb_control_tcp):
1629                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1630                 ZERO_STRUCT(new_addr);
1631                 tcp_sock = &new_addr;
1632                 tcp_sock->src.ip  = old_addr->src;
1633                 tcp_sock->dest.ip = old_addr->dest;
1634                 break;
1635         case sizeof(struct ctdb_control_tcp_addr):
1636                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1637                 break;
1638         default:
1639                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1640                                  "to ctdb_control_tcp_client. size was %d but "
1641                                  "only allowed sizes are %lu and %lu\n",
1642                                  (int)indata.dsize,
1643                                  (long unsigned)sizeof(struct ctdb_control_tcp),
1644                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
1645                 return -1;
1646         }
1647
1648         addr = tcp_sock->src;
1649         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
1650         addr = tcp_sock->dest;
1651         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1652
1653         ZERO_STRUCT(addr);
1654         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1655         vnn = find_public_ip_vnn(ctdb, &addr);
1656         if (vnn == NULL) {
1657                 switch (addr.sa.sa_family) {
1658                 case AF_INET:
1659                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1660                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
1661                                         ctdb_addr_to_str(&addr)));
1662                         }
1663                         break;
1664                 case AF_INET6:
1665                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
1666                                 ctdb_addr_to_str(&addr)));
1667                         break;
1668                 default:
1669                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1670                 }
1671
1672                 return 0;
1673         }
1674
1675         if (vnn->pnn != ctdb->pnn) {
1676                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1677                         ctdb_addr_to_str(&addr),
1678                         client_id, client->pid));
1679                 /* failing this call will tell smbd to die */
1680                 return -1;
1681         }
1682
1683         ip = talloc(client, struct ctdb_client_ip);
1684         CTDB_NO_MEMORY(ctdb, ip);
1685
1686         ip->ctdb      = ctdb;
1687         ip->addr      = addr;
1688         ip->client_id = client_id;
1689         talloc_set_destructor(ip, ctdb_client_ip_destructor);
1690         DLIST_ADD(ctdb->client_ip_list, ip);
1691
1692         tcp = talloc(client, struct ctdb_tcp_list);
1693         CTDB_NO_MEMORY(ctdb, tcp);
1694
1695         tcp->connection.src_addr = tcp_sock->src;
1696         tcp->connection.dst_addr = tcp_sock->dest;
1697
1698         DLIST_ADD(client->tcp_list, tcp);
1699
1700         t.src_addr = tcp_sock->src;
1701         t.dst_addr = tcp_sock->dest;
1702
1703         data.dptr = (uint8_t *)&t;
1704         data.dsize = sizeof(t);
1705
1706         switch (addr.sa.sa_family) {
1707         case AF_INET:
1708                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1709                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
1710                         ctdb_addr_to_str(&tcp_sock->src),
1711                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1712                 break;
1713         case AF_INET6:
1714                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1715                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
1716                         ctdb_addr_to_str(&tcp_sock->src),
1717                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1718                 break;
1719         default:
1720                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1721         }
1722
1723
1724         /* tell all nodes about this tcp connection */
1725         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
1726                                        CTDB_CONTROL_TCP_ADD,
1727                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1728         if (ret != 0) {
1729                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1730                 return -1;
1731         }
1732
1733         return 0;
1734 }
1735
1736 /*
1737   find a tcp address on a list
1738  */
1739 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
1740                                            struct ctdb_tcp_connection *tcp)
1741 {
1742         int i;
1743
1744         if (array == NULL) {
1745                 return NULL;
1746         }
1747
1748         for (i=0;i<array->num;i++) {
1749                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1750                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1751                         return &array->connections[i];
1752                 }
1753         }
1754         return NULL;
1755 }
1756
1757
1758
1759 /*
1760   called by a daemon to inform us of a TCP connection that one of its
1761   clients managing that should tickled with an ACK when IP takeover is
1762   done
1763  */
1764 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
1765 {
1766         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
1767         struct ctdb_tcp_array *tcparray;
1768         struct ctdb_tcp_connection tcp;
1769         struct ctdb_vnn *vnn;
1770
1771         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
1772         if (vnn == NULL) {
1773                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1774                         ctdb_addr_to_str(&p->dst_addr)));
1775
1776                 return -1;
1777         }
1778
1779
1780         tcparray = vnn->tcp_array;
1781
1782         /* If this is the first tickle */
1783         if (tcparray == NULL) {
1784                 tcparray = talloc_size(ctdb->nodes, 
1785                         offsetof(struct ctdb_tcp_array, connections) +
1786                         sizeof(struct ctdb_tcp_connection) * 1);
1787                 CTDB_NO_MEMORY(ctdb, tcparray);
1788                 vnn->tcp_array = tcparray;
1789
1790                 tcparray->num = 0;
1791                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1792                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1793
1794                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
1795                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
1796                 tcparray->num++;
1797
1798                 if (tcp_update_needed) {
1799                         vnn->tcp_update_needed = true;
1800                 }
1801                 return 0;
1802         }
1803
1804
1805         /* Do we already have this tickle ?*/
1806         tcp.src_addr = p->src_addr;
1807         tcp.dst_addr = p->dst_addr;
1808         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1809                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1810                         ctdb_addr_to_str(&tcp.dst_addr),
1811                         ntohs(tcp.dst_addr.ip.sin_port),
1812                         vnn->pnn));
1813                 return 0;
1814         }
1815
1816         /* A new tickle, we must add it to the array */
1817         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1818                                         struct ctdb_tcp_connection,
1819                                         tcparray->num+1);
1820         CTDB_NO_MEMORY(ctdb, tcparray->connections);
1821
1822         vnn->tcp_array = tcparray;
1823         tcparray->connections[tcparray->num].src_addr = p->src_addr;
1824         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
1825         tcparray->num++;
1826                                 
1827         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1828                 ctdb_addr_to_str(&tcp.dst_addr),
1829                 ntohs(tcp.dst_addr.ip.sin_port),
1830                 vnn->pnn));
1831
1832         if (tcp_update_needed) {
1833                 vnn->tcp_update_needed = true;
1834         }
1835
1836         return 0;
1837 }
1838
1839
1840 /*
1841   called by a daemon to inform us of a TCP connection that one of its
1842   clients managing that should tickled with an ACK when IP takeover is
1843   done
1844  */
1845 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1846 {
1847         struct ctdb_tcp_connection *tcpp;
1848         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1849
1850         if (vnn == NULL) {
1851                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1852                         ctdb_addr_to_str(&conn->dst_addr)));
1853                 return;
1854         }
1855
1856         /* if the array is empty we cant remove it
1857            and we dont need to do anything
1858          */
1859         if (vnn->tcp_array == NULL) {
1860                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1861                         ctdb_addr_to_str(&conn->dst_addr),
1862                         ntohs(conn->dst_addr.ip.sin_port)));
1863                 return;
1864         }
1865
1866
1867         /* See if we know this connection
1868            if we dont know this connection  then we dont need to do anything
1869          */
1870         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1871         if (tcpp == NULL) {
1872                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1873                         ctdb_addr_to_str(&conn->dst_addr),
1874                         ntohs(conn->dst_addr.ip.sin_port)));
1875                 return;
1876         }
1877
1878
1879         /* We need to remove this entry from the array.
1880            Instead of allocating a new array and copying data to it
1881            we cheat and just copy the last entry in the existing array
1882            to the entry that is to be removed and just shring the 
1883            ->num field
1884          */
1885         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1886         vnn->tcp_array->num--;
1887
1888         /* If we deleted the last entry we also need to remove the entire array
1889          */
1890         if (vnn->tcp_array->num == 0) {
1891                 talloc_free(vnn->tcp_array);
1892                 vnn->tcp_array = NULL;
1893         }               
1894
1895         vnn->tcp_update_needed = true;
1896
1897         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1898                 ctdb_addr_to_str(&conn->src_addr),
1899                 ntohs(conn->src_addr.ip.sin_port)));
1900 }
1901
1902
1903 /*
1904   called by a daemon to inform us of a TCP connection that one of its
1905   clients used are no longer needed in the tickle database
1906  */
1907 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
1908 {
1909         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
1910
1911         ctdb_remove_tcp_connection(ctdb, conn);
1912
1913         return 0;
1914 }
1915
1916
/*
  called when a daemon restarts.
  the intention is to send all tickles for all public addresses we are
  serving to the new node right away; this is not implemented yet and
  the function currently does nothing but return 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        return 0;
}
1926
1927
1928 /*
1929   called when a client structure goes away - hook to remove
1930   elements from the tcp_list in all daemons
1931  */
1932 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1933 {
1934         while (client->tcp_list) {
1935                 struct ctdb_tcp_list *tcp = client->tcp_list;
1936                 DLIST_REMOVE(client->tcp_list, tcp);
1937                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1938         }
1939 }
1940
1941
1942 /*
1943   release all IPs on shutdown
1944  */
1945 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1946 {
1947         struct ctdb_vnn *vnn;
1948
1949         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1950                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1951                         ctdb_vnn_unassign_iface(ctdb, vnn);
1952                         continue;
1953                 }
1954                 if (!vnn->iface) {
1955                         continue;
1956                 }
1957                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1958                                   ctdb_vnn_iface_string(vnn),
1959                                   ctdb_addr_to_str(&vnn->public_address),
1960                                   vnn->public_netmask_bits);
1961                 release_kill_clients(ctdb, &vnn->public_address);
1962                 ctdb_vnn_unassign_iface(ctdb, vnn);
1963         }
1964 }
1965
1966
1967 /*
1968   get list of public IPs
1969  */
1970 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
1971                                     struct ctdb_req_control *c, TDB_DATA *outdata)
1972 {
1973         int i, num, len;
1974         struct ctdb_all_public_ips *ips;
1975         struct ctdb_vnn *vnn;
1976         bool only_available = false;
1977
1978         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
1979                 only_available = true;
1980         }
1981
1982         /* count how many public ip structures we have */
1983         num = 0;
1984         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1985                 num++;
1986         }
1987
1988         len = offsetof(struct ctdb_all_public_ips, ips) + 
1989                 num*sizeof(struct ctdb_public_ip);
1990         ips = talloc_zero_size(outdata, len);
1991         CTDB_NO_MEMORY(ctdb, ips);
1992
1993         i = 0;
1994         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1995                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
1996                         continue;
1997                 }
1998                 ips->ips[i].pnn  = vnn->pnn;
1999                 ips->ips[i].addr = vnn->public_address;
2000                 i++;
2001         }
2002         ips->num = i;
2003         len = offsetof(struct ctdb_all_public_ips, ips) +
2004                 i*sizeof(struct ctdb_public_ip);
2005
2006         outdata->dsize = len;
2007         outdata->dptr  = (uint8_t *)ips;
2008
2009         return 0;
2010 }
2011
2012
2013 /*
2014   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2015  */
2016 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2017                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2018 {
2019         int i, num, len;
2020         struct ctdb_all_public_ipsv4 *ips;
2021         struct ctdb_vnn *vnn;
2022
2023         /* count how many public ip structures we have */
2024         num = 0;
2025         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2026                 if (vnn->public_address.sa.sa_family != AF_INET) {
2027                         continue;
2028                 }
2029                 num++;
2030         }
2031
2032         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2033                 num*sizeof(struct ctdb_public_ipv4);
2034         ips = talloc_zero_size(outdata, len);
2035         CTDB_NO_MEMORY(ctdb, ips);
2036
2037         outdata->dsize = len;
2038         outdata->dptr  = (uint8_t *)ips;
2039
2040         ips->num = num;
2041         i = 0;
2042         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2043                 if (vnn->public_address.sa.sa_family != AF_INET) {
2044                         continue;
2045                 }
2046                 ips->ips[i].pnn = vnn->pnn;
2047                 ips->ips[i].sin = vnn->public_address.ip;
2048                 i++;
2049         }
2050
2051         return 0;
2052 }
2053
2054 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2055                                         struct ctdb_req_control *c,
2056                                         TDB_DATA indata,
2057                                         TDB_DATA *outdata)
2058 {
2059         int i, num, len;
2060         ctdb_sock_addr *addr;
2061         struct ctdb_control_public_ip_info *info;
2062         struct ctdb_vnn *vnn;
2063
2064         addr = (ctdb_sock_addr *)indata.dptr;
2065
2066         vnn = find_public_ip_vnn(ctdb, addr);
2067         if (vnn == NULL) {
2068                 /* if it is not a public ip   it could be our 'single ip' */
2069                 if (ctdb->single_ip_vnn) {
2070                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2071                                 vnn = ctdb->single_ip_vnn;
2072                         }
2073                 }
2074         }
2075         if (vnn == NULL) {
2076                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2077                                  "'%s'not a public address\n",
2078                                  ctdb_addr_to_str(addr)));
2079                 return -1;
2080         }
2081
2082         /* count how many public ip structures we have */
2083         num = 0;
2084         for (;vnn->ifaces[num];) {
2085                 num++;
2086         }
2087
2088         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2089                 num*sizeof(struct ctdb_control_iface_info);
2090         info = talloc_zero_size(outdata, len);
2091         CTDB_NO_MEMORY(ctdb, info);
2092
2093         info->ip.addr = vnn->public_address;
2094         info->ip.pnn = vnn->pnn;
2095         info->active_idx = 0xFFFFFFFF;
2096
2097         for (i=0; vnn->ifaces[i]; i++) {
2098                 struct ctdb_iface *cur;
2099
2100                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2101                 if (cur == NULL) {
2102                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2103                                            vnn->ifaces[i]));
2104                         return -1;
2105                 }
2106                 if (vnn->iface == cur) {
2107                         info->active_idx = i;
2108                 }
2109                 strcpy(info->ifaces[i].name, cur->name);
2110                 info->ifaces[i].link_state = cur->link_up;
2111                 info->ifaces[i].references = cur->references;
2112         }
2113         info->num = i;
2114         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2115                 i*sizeof(struct ctdb_control_iface_info);
2116
2117         outdata->dsize = len;
2118         outdata->dptr  = (uint8_t *)info;
2119
2120         return 0;
2121 }
2122
2123 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2124                                 struct ctdb_req_control *c,
2125                                 TDB_DATA *outdata)
2126 {
2127         int i, num, len;
2128         struct ctdb_control_get_ifaces *ifaces;
2129         struct ctdb_iface *cur;
2130
2131         /* count how many public ip structures we have */
2132         num = 0;
2133         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2134                 num++;
2135         }
2136
2137         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2138                 num*sizeof(struct ctdb_control_iface_info);
2139         ifaces = talloc_zero_size(outdata, len);
2140         CTDB_NO_MEMORY(ctdb, ifaces);
2141
2142         i = 0;
2143         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2144                 strcpy(ifaces->ifaces[i].name, cur->name);
2145                 ifaces->ifaces[i].link_state = cur->link_up;
2146                 ifaces->ifaces[i].references = cur->references;
2147                 i++;
2148         }
2149         ifaces->num = i;
2150         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2151                 i*sizeof(struct ctdb_control_iface_info);
2152
2153         outdata->dsize = len;
2154         outdata->dptr  = (uint8_t *)ifaces;
2155
2156         return 0;
2157 }
2158
2159 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2160                                     struct ctdb_req_control *c,
2161                                     TDB_DATA indata)
2162 {
2163         struct ctdb_control_iface_info *info;
2164         struct ctdb_iface *iface;
2165         bool link_up = false;
2166
2167         info = (struct ctdb_control_iface_info *)indata.dptr;
2168
2169         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2170                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2171                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2172                                   len, len, info->name));
2173                 return -1;
2174         }
2175
2176         switch (info->link_state) {
2177         case 0:
2178                 link_up = false;
2179                 break;
2180         case 1:
2181                 link_up = true;
2182                 break;
2183         default:
2184                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2185                                   (unsigned int)info->link_state));
2186                 return -1;
2187         }
2188
2189         if (info->references != 0) {
2190                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2191                                   (unsigned int)info->references));
2192                 return -1;
2193         }
2194
2195         iface = ctdb_find_iface(ctdb, info->name);
2196         if (iface == NULL) {
2197                 DEBUG(DEBUG_ERR, (__location__ "iface[%s] is unknown\n",
2198                                   info->name));
2199                 return -1;
2200         }
2201
2202         if (link_up == iface->link_up) {
2203                 return 0;
2204         }
2205
2206         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2207               ("iface[%s] has changed it's link status %s => %s\n",
2208                iface->name,
2209                iface->link_up?"up":"down",
2210                link_up?"up":"down"));
2211
2212         iface->link_up = link_up;
2213         return 0;
2214 }
2215
2216
2217 /* 
2218    structure containing the listening socket and the list of tcp connections
2219    that the ctdb daemon is to kill
2220 */
2221 struct ctdb_kill_tcp {
2222         struct ctdb_vnn *vnn;
2223         struct ctdb_context *ctdb;
2224         int capture_fd;
2225         struct fd_event *fde;
2226         trbt_tree_t *connections;
2227         void *private_data;
2228 };
2229
2230 /*
2231   a tcp connection that is to be killed
2232  */
2233 struct ctdb_killtcp_con {
2234         ctdb_sock_addr src_addr;
2235         ctdb_sock_addr dst_addr;
2236         int count;
2237         struct ctdb_kill_tcp *killtcp;
2238 };
2239
2240 /* this function is used to create a key to represent this socketpair
2241    in the killtcp tree.
2242    this key is used to insert and lookup matching socketpairs that are
2243    to be tickled and RST
2244 */
2245 #define KILLTCP_KEYLEN  10
2246 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2247 {
2248         static uint32_t key[KILLTCP_KEYLEN];
2249
2250         bzero(key, sizeof(key));
2251
2252         if (src->sa.sa_family != dst->sa.sa_family) {
2253                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2254                 return key;
2255         }
2256         
2257         switch (src->sa.sa_family) {
2258         case AF_INET:
2259                 key[0]  = dst->ip.sin_addr.s_addr;
2260                 key[1]  = src->ip.sin_addr.s_addr;
2261                 key[2]  = dst->ip.sin_port;
2262                 key[3]  = src->ip.sin_port;
2263                 break;
2264         case AF_INET6:
2265                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
2266                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
2267                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
2268                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
2269                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
2270                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
2271                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
2272                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
2273                 key[8]  = dst->ip6.sin6_port;
2274                 key[9]  = src->ip6.sin6_port;
2275                 break;
2276         default:
2277                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2278                 return key;
2279         }
2280
2281         return key;
2282 }
2283
2284 /*
2285   called when we get a read event on the raw socket
2286  */
2287 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2288                                 uint16_t flags, void *private_data)
2289 {
2290         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2291         struct ctdb_killtcp_con *con;
2292         ctdb_sock_addr src, dst;
2293         uint32_t ack_seq, seq;
2294
2295         if (!(flags & EVENT_FD_READ)) {
2296                 return;
2297         }
2298
2299         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2300                                 killtcp->private_data,
2301                                 &src, &dst,
2302                                 &ack_seq, &seq) != 0) {
2303                 /* probably a non-tcp ACK packet */
2304                 return;
2305         }
2306
2307         /* check if we have this guy in our list of connections
2308            to kill
2309         */
2310         con = trbt_lookuparray32(killtcp->connections, 
2311                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2312         if (con == NULL) {
2313                 /* no this was some other packet we can just ignore */
2314                 return;
2315         }
2316
2317         /* This one has been tickled !
2318            now reset him and remove him from the list.
2319          */
2320         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2321                 ntohs(con->dst_addr.ip.sin_port),
2322                 ctdb_addr_to_str(&con->src_addr),
2323                 ntohs(con->src_addr.ip.sin_port)));
2324
2325         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2326         talloc_free(con);
2327 }
2328
2329
2330 /* when traversing the list of all tcp connections to send tickle acks to
2331    (so that we can capture the ack coming back and kill the connection
2332     by a RST)
2333    this callback is called for each connection we are currently trying to kill
2334 */
2335 static void tickle_connection_traverse(void *param, void *data)
2336 {
2337         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2338
2339         /* have tried too many times, just give up */
2340         if (con->count >= 5) {
2341                 /* can't delete in traverse: reparent to delete_cons */
2342                 talloc_steal(param, con);
2343                 return;
2344         }
2345
2346         /* othervise, try tickling it again */
2347         con->count++;
2348         ctdb_sys_send_tcp(
2349                 (ctdb_sock_addr *)&con->dst_addr,
2350                 (ctdb_sock_addr *)&con->src_addr,
2351                 0, 0, 0);
2352 }
2353
2354
2355 /* 
2356    called every second until all sentenced connections have been reset
2357  */
2358 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2359                                               struct timeval t, void *private_data)
2360 {
2361         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2362         void *delete_cons = talloc_new(NULL);
2363
2364         /* loop over all connections sending tickle ACKs */
2365         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2366
2367         /* now we've finished traverse, it's safe to do deletion. */
2368         talloc_free(delete_cons);
2369
2370         /* If there are no more connections to kill we can remove the
2371            entire killtcp structure
2372          */
2373         if ( (killtcp->connections == NULL) || 
2374              (killtcp->connections->root == NULL) ) {
2375                 talloc_free(killtcp);
2376                 return;
2377         }
2378
2379         /* try tickling them again in a seconds time
2380          */
2381         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2382                         ctdb_tickle_sentenced_connections, killtcp);
2383 }
2384
2385 /*
2386   destroy the killtcp structure
2387  */
2388 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2389 {
2390         if (killtcp->vnn) {
2391                 killtcp->vnn->killtcp = NULL;
2392         }
2393         return 0;
2394 }
2395
2396
/* insert callback for the killtcp tree.
   nothing fancy here: 'parm' (the new connection structure) is returned
   unconditionally, so it replaces whatever 'data' was stored before.

   the old structure, if there was one, is deliberately not freed here:
   it is talloc_stolen by the same node in the tree anyway and will be
   deleted when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
2408
2409 /*
2410   add a tcp socket to the list of connections we want to RST
2411  */
2412 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2413                                        ctdb_sock_addr *s,
2414                                        ctdb_sock_addr *d)
2415 {
2416         ctdb_sock_addr src, dst;
2417         struct ctdb_kill_tcp *killtcp;
2418         struct ctdb_killtcp_con *con;
2419         struct ctdb_vnn *vnn;
2420
2421         ctdb_canonicalize_ip(s, &src);
2422         ctdb_canonicalize_ip(d, &dst);
2423
2424         vnn = find_public_ip_vnn(ctdb, &dst);
2425         if (vnn == NULL) {
2426                 vnn = find_public_ip_vnn(ctdb, &src);
2427         }
2428         if (vnn == NULL) {
2429                 /* if it is not a public ip   it could be our 'single ip' */
2430                 if (ctdb->single_ip_vnn) {
2431                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2432                                 vnn = ctdb->single_ip_vnn;
2433                         }
2434                 }
2435         }
2436         if (vnn == NULL) {
2437                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2438                 return -1;
2439         }
2440
2441         killtcp = vnn->killtcp;
2442         
2443         /* If this is the first connection to kill we must allocate
2444            a new structure
2445          */
2446         if (killtcp == NULL) {
2447                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2448                 CTDB_NO_MEMORY(ctdb, killtcp);
2449
2450                 killtcp->vnn         = vnn;
2451                 killtcp->ctdb        = ctdb;
2452                 killtcp->capture_fd  = -1;
2453                 killtcp->connections = trbt_create(killtcp, 0);
2454
2455                 vnn->killtcp         = killtcp;
2456                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2457         }
2458
2459
2460
2461         /* create a structure that describes this connection we want to
2462            RST and store it in killtcp->connections
2463         */
2464         con = talloc(killtcp, struct ctdb_killtcp_con);
2465         CTDB_NO_MEMORY(ctdb, con);
2466         con->src_addr = src;
2467         con->dst_addr = dst;
2468         con->count    = 0;
2469         con->killtcp  = killtcp;
2470
2471
2472         trbt_insertarray32_callback(killtcp->connections,
2473                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2474                         add_killtcp_callback, con);
2475
2476         /* 
2477            If we dont have a socket to listen on yet we must create it
2478          */
2479         if (killtcp->capture_fd == -1) {
2480                 const char *iface = ctdb_vnn_iface_string(vnn);
2481                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2482                 if (killtcp->capture_fd == -1) {
2483                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2484                                           "socket on iface '%s' for killtcp (%s)\n",
2485                                           iface, strerror(errno)));
2486                         goto failed;
2487                 }
2488         }
2489
2490
2491         if (killtcp->fde == NULL) {
2492                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
2493                                             EVENT_FD_READ,
2494                                             capture_tcp_handler, killtcp);
2495                 tevent_fd_set_auto_close(killtcp->fde);
2496
2497                 /* We also need to set up some events to tickle all these connections
2498                    until they are all reset
2499                 */
2500                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2501                                 ctdb_tickle_sentenced_connections, killtcp);
2502         }
2503
2504         /* tickle him once now */
2505         ctdb_sys_send_tcp(
2506                 &con->dst_addr,
2507                 &con->src_addr,
2508                 0, 0, 0);
2509
2510         return 0;
2511
2512 failed:
2513         talloc_free(vnn->killtcp);
2514         vnn->killtcp = NULL;
2515         return -1;
2516 }
2517
2518 /*
2519   kill a TCP connection.
2520  */
2521 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2522 {
2523         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2524
2525         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2526 }
2527
2528 /*
2529   called by a daemon to inform us of the entire list of TCP tickles for
2530   a particular public address.
2531   this control should only be sent by the node that is currently serving
2532   that public address.
2533  */
2534 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2535 {
2536         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2537         struct ctdb_tcp_array *tcparray;
2538         struct ctdb_vnn *vnn;
2539
2540         /* We must at least have tickles.num or else we cant verify the size
2541            of the received data blob
2542          */
2543         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2544                                         tickles.connections)) {
2545                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2546                 return -1;
2547         }
2548
2549         /* verify that the size of data matches what we expect */
2550         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2551                                 tickles.connections)
2552                          + sizeof(struct ctdb_tcp_connection)
2553                                  * list->tickles.num) {
2554                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
2555                 return -1;
2556         }       
2557
2558         vnn = find_public_ip_vnn(ctdb, &list->addr);
2559         if (vnn == NULL) {
2560                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
2561                         ctdb_addr_to_str(&list->addr)));
2562
2563                 return 1;
2564         }
2565
2566         /* remove any old ticklelist we might have */
2567         talloc_free(vnn->tcp_array);
2568         vnn->tcp_array = NULL;
2569
2570         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
2571         CTDB_NO_MEMORY(ctdb, tcparray);
2572
2573         tcparray->num = list->tickles.num;
2574
2575         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
2576         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2577
2578         memcpy(tcparray->connections, &list->tickles.connections[0], 
2579                sizeof(struct ctdb_tcp_connection)*tcparray->num);
2580
2581         /* We now have a new fresh tickle list array for this vnn */
2582         vnn->tcp_array = talloc_steal(vnn, tcparray);
2583         
2584         return 0;
2585 }
2586
2587 /*
2588   called to return the full list of tickles for the puclic address associated 
2589   with the provided vnn
2590  */
2591 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2592 {
2593         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2594         struct ctdb_control_tcp_tickle_list *list;
2595         struct ctdb_tcp_array *tcparray;
2596         int num;
2597         struct ctdb_vnn *vnn;
2598
2599         vnn = find_public_ip_vnn(ctdb, addr);
2600         if (vnn == NULL) {
2601                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
2602                         ctdb_addr_to_str(addr)));
2603
2604                 return 1;
2605         }
2606
2607         tcparray = vnn->tcp_array;
2608         if (tcparray) {
2609                 num = tcparray->num;
2610         } else {
2611                 num = 0;
2612         }
2613
2614         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2615                                 tickles.connections)
2616                         + sizeof(struct ctdb_tcp_connection) * num;
2617
2618         outdata->dptr  = talloc_size(outdata, outdata->dsize);
2619         CTDB_NO_MEMORY(ctdb, outdata->dptr);
2620         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
2621
2622         list->addr = *addr;
2623         list->tickles.num = num;
2624         if (num) {
2625                 memcpy(&list->tickles.connections[0], tcparray->connections, 
2626                         sizeof(struct ctdb_tcp_connection) * num);
2627         }
2628
2629         return 0;
2630 }
2631
2632
2633 /*
2634   set the list of all tcp tickles for a public address
2635  */
2636 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
2637                               struct timeval timeout, uint32_t destnode, 
2638                               ctdb_sock_addr *addr,
2639                               struct ctdb_tcp_array *tcparray)
2640 {
2641         int ret, num;
2642         TDB_DATA data;
2643         struct ctdb_control_tcp_tickle_list *list;
2644
2645         if (tcparray) {
2646                 num = tcparray->num;
2647         } else {
2648                 num = 0;
2649         }
2650
2651         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
2652                                 tickles.connections) +
2653                         sizeof(struct ctdb_tcp_connection) * num;
2654         data.dptr = talloc_size(ctdb, data.dsize);
2655         CTDB_NO_MEMORY(ctdb, data.dptr);
2656
2657         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
2658         list->addr = *addr;
2659         list->tickles.num = num;
2660         if (tcparray) {
2661                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
2662         }
2663
2664         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2665                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2666                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2667         if (ret != 0) {
2668                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2669                 return -1;
2670         }
2671
2672         talloc_free(data.dptr);
2673
2674         return ret;
2675 }
2676
2677
2678 /*
2679   perform tickle updates if required
2680  */
2681 static void ctdb_update_tcp_tickles(struct event_context *ev, 
2682                                 struct timed_event *te, 
2683                                 struct timeval t, void *private_data)
2684 {
2685         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2686         int ret;
2687         struct ctdb_vnn *vnn;
2688
2689         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2690                 /* we only send out updates for public addresses that 
2691                    we have taken over
2692                  */
2693                 if (ctdb->pnn != vnn->pnn) {
2694                         continue;
2695                 }
2696                 /* We only send out the updates if we need to */
2697                 if (!vnn->tcp_update_needed) {
2698                         continue;
2699                 }
2700                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
2701                                 TAKEOVER_TIMEOUT(),
2702                                 CTDB_BROADCAST_CONNECTED,
2703                                 &vnn->public_address,
2704                                 vnn->tcp_array);
2705                 if (ret != 0) {
2706                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2707                                 ctdb_addr_to_str(&vnn->public_address)));
2708                 }
2709         }
2710
2711         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2712                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2713                              ctdb_update_tcp_tickles, ctdb);
2714 }               
2715         
2716
2717 /*
2718   start periodic update of tcp tickles
2719  */
2720 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2721 {
2722         ctdb->tickle_update_context = talloc_new(ctdb);
2723
2724         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2725                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
2726                              ctdb_update_tcp_tickles, ctdb);
2727 }
2728
2729
2730
2731
2732 struct control_gratious_arp {
2733         struct ctdb_context *ctdb;
2734         ctdb_sock_addr addr;
2735         const char *iface;
2736         int count;
2737 };
2738
2739 /*
2740   send a control_gratuitous arp
2741  */
2742 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
2743                                   struct timeval t, void *private_data)
2744 {
2745         int ret;
2746         struct control_gratious_arp *arp = talloc_get_type(private_data, 
2747                                                         struct control_gratious_arp);
2748
2749         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2750         if (ret != 0) {
2751                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2752                                  arp->iface, strerror(errno)));
2753         }
2754
2755
2756         arp->count++;
2757         if (arp->count == CTDB_ARP_REPEAT) {
2758                 talloc_free(arp);
2759                 return;
2760         }
2761
2762         event_add_timed(arp->ctdb->ev, arp, 
2763                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
2764                         send_gratious_arp, arp);
2765 }
2766
2767
2768 /*
2769   send a gratious arp 
2770  */
2771 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2772 {
2773         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2774         struct control_gratious_arp *arp;
2775
2776         /* verify the size of indata */
2777         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2778                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
2779                                  (unsigned)indata.dsize, 
2780                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2781                 return -1;
2782         }
2783         if (indata.dsize != 
2784                 ( offsetof(struct ctdb_control_gratious_arp, iface)
2785                 + gratious_arp->len ) ){
2786
2787                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2788                         "but should be %u bytes\n", 
2789                          (unsigned)indata.dsize, 
2790                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2791                 return -1;
2792         }
2793
2794
2795         arp = talloc(ctdb, struct control_gratious_arp);
2796         CTDB_NO_MEMORY(ctdb, arp);
2797
2798         arp->ctdb  = ctdb;
2799         arp->addr   = gratious_arp->addr;
2800         arp->iface = talloc_strdup(arp, gratious_arp->iface);
2801         CTDB_NO_MEMORY(ctdb, arp->iface);
2802         arp->count = 0;
2803         
2804         event_add_timed(arp->ctdb->ev, arp, 
2805                         timeval_zero(), send_gratious_arp, arp);
2806
2807         return 0;
2808 }
2809
2810 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2811 {
2812         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2813         int ret;
2814
2815         /* verify the size of indata */
2816         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2817                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2818                 return -1;
2819         }
2820         if (indata.dsize != 
2821                 ( offsetof(struct ctdb_control_ip_iface, iface)
2822                 + pub->len ) ){
2823
2824                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2825                         "but should be %u bytes\n", 
2826                          (unsigned)indata.dsize, 
2827                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2828                 return -1;
2829         }
2830
2831         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2832
2833         if (ret != 0) {
2834                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2835                 return -1;
2836         }
2837
2838         return 0;
2839 }
2840
/*
  called when releaseip event finishes for del_public_address

  private_data is the talloc context that was created for the event
  script run; it is released here regardless of the status reported.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
                                void *private_data)
{
        void *event_ctx = private_data;

        talloc_free(event_ctx);
}
2849
2850 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2851 {
2852         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2853         struct ctdb_vnn *vnn;
2854         int ret;
2855
2856         /* verify the size of indata */
2857         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2858                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2859                 return -1;
2860         }
2861         if (indata.dsize != 
2862                 ( offsetof(struct ctdb_control_ip_iface, iface)
2863                 + pub->len ) ){
2864
2865                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2866                         "but should be %u bytes\n", 
2867                          (unsigned)indata.dsize, 
2868                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2869                 return -1;
2870         }
2871
2872         /* walk over all public addresses until we find a match */
2873         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2874                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2875                         TALLOC_CTX *mem_ctx;
2876
2877                         DLIST_REMOVE(ctdb->vnn, vnn);
2878                         if (vnn->iface == NULL) {
2879                                 talloc_free(vnn);
2880                                 return 0;
2881                         }
2882
2883                         mem_ctx = talloc_new(ctdb);
2884                         ret = ctdb_event_script_callback(ctdb, 
2885                                          mem_ctx, delete_ip_callback, mem_ctx,
2886                                          false,
2887                                          CTDB_EVENT_RELEASE_IP,
2888                                          "%s %s %u",
2889                                          ctdb_vnn_iface_string(vnn),
2890                                          ctdb_addr_to_str(&vnn->public_address),
2891                                          vnn->public_netmask_bits);
2892                         ctdb_vnn_unassign_iface(ctdb, vnn);
2893                         talloc_free(vnn);
2894                         if (ret != 0) {
2895                                 return -1;
2896                         }
2897                         return 0;
2898                 }
2899         }
2900
2901         return -1;
2902 }
2903
2904 /* This function is called from the recovery daemon to verify that a remote
2905    node has the expected ip allocation.
2906    This is verified against ctdb->ip_tree
2907 */
2908 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
2909 {
2910         struct ctdb_public_ip_list *tmp_ip; 
2911         int i;
2912
2913         if (ctdb->ip_tree == NULL) {
2914                 /* dont know the expected allocation yet, assume remote node
2915                    is correct. */
2916                 return 0;
2917         }
2918
2919         if (ips == NULL) {
2920                 return 0;
2921         }
2922
2923         for (i=0; i<ips->num; i++) {
2924                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
2925                 if (tmp_ip == NULL) {
2926                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
2927                         return -1;
2928                 }
2929
2930                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
2931                         continue;
2932                 }
2933
2934                 if (tmp_ip->pnn != ips->ips[i].pnn) {
2935                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
2936                         return -1;
2937                 }
2938         }
2939
2940         return 0;
2941 }
2942
2943 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
2944 {
2945         struct ctdb_public_ip_list *tmp_ip; 
2946
2947         if (ctdb->ip_tree == NULL) {
2948                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
2949                 return -1;
2950         }
2951
2952         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
2953         if (tmp_ip == NULL) {
2954                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
2955                 return -1;
2956         }
2957
2958         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
2959         tmp_ip->pnn = ip->pnn;
2960
2961         return 0;
2962 }