IP allocation - add LCP2 algorithm.
[ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tevent/tevent.h"
23 #include "lib/tdb/include/tdb.h"
24 #include "lib/util/dlinklist.h"
25 #include "system/network.h"
26 #include "system/filesys.h"
27 #include "system/wait.h"
28 #include "../include/ctdb_private.h"
29 #include "../common/rb_tree.h"
30
31
/* timeout value built from the takeover_timeout tunable; expands to a
   call that needs a variable named 'ctdb' in the calling scope */
#define TAKEOVER_TIMEOUT() \
	(timeval_current_ofs(ctdb->tunable.takeover_timeout, 0))

/* first argument handed to timeval_current_ofs() when re-arming the
   gratuitous arp timer */
#define CTDB_ARP_INTERVAL (1)
/* number of arp rounds sent per announcement before the timer stops */
#define CTDB_ARP_REPEAT   (3)
36
/*
  one local network interface that public addresses can be assigned to
 */
struct ctdb_iface {
	/* linkage of the ctdb->ifaces list */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, compared with strcmp() on lookup */
	const char *name;
	/* only interfaces with link_up set are picked for new addresses */
	bool link_up;
	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
43
44 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
45 {
46         if (vnn->iface) {
47                 return vnn->iface->name;
48         }
49
50         return "__none__";
51 }
52
53 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
54 {
55         struct ctdb_iface *i;
56
57         /* Verify that we dont have an entry for this ip yet */
58         for (i=ctdb->ifaces;i;i=i->next) {
59                 if (strcmp(i->name, iface) == 0) {
60                         return 0;
61                 }
62         }
63
64         /* create a new structure for this interface */
65         i = talloc_zero(ctdb, struct ctdb_iface);
66         CTDB_NO_MEMORY_FATAL(ctdb, i);
67         i->name = talloc_strdup(i, iface);
68         CTDB_NO_MEMORY(ctdb, i->name);
69         i->link_up = false;
70
71         DLIST_ADD(ctdb->ifaces, i);
72
73         return 0;
74 }
75
76 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
77                                           const char *iface)
78 {
79         struct ctdb_iface *i;
80
81         /* Verify that we dont have an entry for this ip yet */
82         for (i=ctdb->ifaces;i;i=i->next) {
83                 if (strcmp(i->name, iface) == 0) {
84                         return i;
85                 }
86         }
87
88         return NULL;
89 }
90
91 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
92                                               struct ctdb_vnn *vnn)
93 {
94         int i;
95         struct ctdb_iface *cur = NULL;
96         struct ctdb_iface *best = NULL;
97
98         for (i=0; vnn->ifaces[i]; i++) {
99
100                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
101                 if (cur == NULL) {
102                         continue;
103                 }
104
105                 if (!cur->link_up) {
106                         continue;
107                 }
108
109                 if (best == NULL) {
110                         best = cur;
111                         continue;
112                 }
113
114                 if (cur->references < best->references) {
115                         best = cur;
116                         continue;
117                 }
118         }
119
120         return best;
121 }
122
123 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
124                                      struct ctdb_vnn *vnn)
125 {
126         struct ctdb_iface *best = NULL;
127
128         if (vnn->iface) {
129                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
130                                    "still assigned to iface '%s'\n",
131                                    ctdb_addr_to_str(&vnn->public_address),
132                                    ctdb_vnn_iface_string(vnn)));
133                 return 0;
134         }
135
136         best = ctdb_vnn_best_iface(ctdb, vnn);
137         if (best == NULL) {
138                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
139                                   "cannot assign to iface any iface\n",
140                                   ctdb_addr_to_str(&vnn->public_address)));
141                 return -1;
142         }
143
144         vnn->iface = best;
145         best->references++;
146         vnn->pnn = ctdb->pnn;
147
148         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
149                            "now assigned to iface '%s' refs[%d]\n",
150                            ctdb_addr_to_str(&vnn->public_address),
151                            ctdb_vnn_iface_string(vnn),
152                            best->references));
153         return 0;
154 }
155
156 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
157                                     struct ctdb_vnn *vnn)
158 {
159         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
160                            "now unassigned (old iface '%s' refs[%d])\n",
161                            ctdb_addr_to_str(&vnn->public_address),
162                            ctdb_vnn_iface_string(vnn),
163                            vnn->iface?vnn->iface->references:0));
164         if (vnn->iface) {
165                 vnn->iface->references--;
166         }
167         vnn->iface = NULL;
168         if (vnn->pnn == ctdb->pnn) {
169                 vnn->pnn = -1;
170         }
171 }
172
173 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
174                                struct ctdb_vnn *vnn)
175 {
176         int i;
177
178         if (vnn->iface && vnn->iface->link_up) {
179                 return true;
180         }
181
182         for (i=0; vnn->ifaces[i]; i++) {
183                 struct ctdb_iface *cur;
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (cur->link_up) {
191                         return true;
192                 }
193         }
194
195         return false;
196 }
197
198 struct ctdb_takeover_arp {
199         struct ctdb_context *ctdb;
200         uint32_t count;
201         ctdb_sock_addr addr;
202         struct ctdb_tcp_array *tcparray;
203         struct ctdb_vnn *vnn;
204 };
205
206
207 /*
208   lists of tcp endpoints
209  */
210 struct ctdb_tcp_list {
211         struct ctdb_tcp_list *prev, *next;
212         struct ctdb_tcp_connection connection;
213 };
214
215 /*
216   list of clients to kill on IP release
217  */
218 struct ctdb_client_ip {
219         struct ctdb_client_ip *prev, *next;
220         struct ctdb_context *ctdb;
221         ctdb_sock_addr addr;
222         uint32_t client_id;
223 };
224
225
226 /*
227   send a gratuitous arp
228  */
229 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
230                                   struct timeval t, void *private_data)
231 {
232         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
233                                                         struct ctdb_takeover_arp);
234         int i, ret;
235         struct ctdb_tcp_array *tcparray;
236         const char *iface = ctdb_vnn_iface_string(arp->vnn);
237
238         ret = ctdb_sys_send_arp(&arp->addr, iface);
239         if (ret != 0) {
240                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
241                                   iface, strerror(errno)));
242         }
243
244         tcparray = arp->tcparray;
245         if (tcparray) {
246                 for (i=0;i<tcparray->num;i++) {
247                         struct ctdb_tcp_connection *tcon;
248
249                         tcon = &tcparray->connections[i];
250                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
251                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
252                                 ctdb_addr_to_str(&tcon->src_addr),
253                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
254                         ret = ctdb_sys_send_tcp(
255                                 &tcon->src_addr, 
256                                 &tcon->dst_addr,
257                                 0, 0, 0);
258                         if (ret != 0) {
259                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
260                                         ctdb_addr_to_str(&tcon->src_addr)));
261                         }
262                 }
263         }
264
265         arp->count++;
266
267         if (arp->count == CTDB_ARP_REPEAT) {
268                 talloc_free(arp);
269                 return;
270         }
271
272         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
273                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
274                         ctdb_control_send_arp, arp);
275 }
276
277 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
278                                        struct ctdb_vnn *vnn)
279 {
280         struct ctdb_takeover_arp *arp;
281         struct ctdb_tcp_array *tcparray;
282
283         if (!vnn->takeover_ctx) {
284                 vnn->takeover_ctx = talloc_new(vnn);
285                 if (!vnn->takeover_ctx) {
286                         return -1;
287                 }
288         }
289
290         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
291         if (!arp) {
292                 return -1;
293         }
294
295         arp->ctdb = ctdb;
296         arp->addr = vnn->public_address;
297         arp->vnn  = vnn;
298
299         tcparray = vnn->tcp_array;
300         if (tcparray) {
301                 /* add all of the known tcp connections for this IP to the
302                    list of tcp connections to send tickle acks for */
303                 arp->tcparray = talloc_steal(arp, tcparray);
304
305                 vnn->tcp_array = NULL;
306                 vnn->tcp_update_needed = true;
307         }
308
309         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
310                         timeval_zero(), ctdb_control_send_arp, arp);
311
312         return 0;
313 }
314
315 struct takeover_callback_state {
316         struct ctdb_req_control *c;
317         ctdb_sock_addr *addr;
318         struct ctdb_vnn *vnn;
319 };
320
/*
  state handed from ctdb_do_takeip() to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
	/* the control request that gets the reply */
	struct ctdb_req_control *c;
	/* public address entry being taken over */
	struct ctdb_vnn         *vnn;
};
325
326 /*
327   called when takeip event finishes
328  */
329 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
330                                     void *private_data)
331 {
332         struct ctdb_do_takeip_state *state =
333                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
334         int32_t ret;
335         TDB_DATA data;
336
337         if (status != 0) {
338                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
339         
340                 if (status == -ETIME) {
341                         ctdb_ban_self(ctdb);
342                 }
343                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
344                                  ctdb_addr_to_str(&state->vnn->public_address),
345                                  ctdb_vnn_iface_string(state->vnn)));
346                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
347
348                 node->flags |= NODE_FLAGS_UNHEALTHY;
349                 talloc_free(state);
350                 return;
351         }
352
353         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
354         if (ret != 0) {
355                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
356                 talloc_free(state);
357                 return;
358         }
359
360         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
361         data.dsize = strlen((char *)data.dptr) + 1;
362         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
363
364         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
365
366
367         /* the control succeeded */
368         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
369         talloc_free(state);
370         return;
371 }
372
373 /*
374   take over an ip address
375  */
376 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
377                               struct ctdb_req_control *c,
378                               struct ctdb_vnn *vnn)
379 {
380         int ret;
381         struct ctdb_do_takeip_state *state;
382
383         ret = ctdb_vnn_assign_iface(ctdb, vnn);
384         if (ret != 0) {
385                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
386                                  "assin a usable interface\n",
387                                  ctdb_addr_to_str(&vnn->public_address),
388                                  vnn->public_netmask_bits));
389                 return -1;
390         }
391
392         state = talloc(vnn, struct ctdb_do_takeip_state);
393         CTDB_NO_MEMORY(ctdb, state);
394
395         state->c = talloc_steal(ctdb, c);
396         state->vnn   = vnn;
397
398         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
399                             ctdb_addr_to_str(&vnn->public_address),
400                             vnn->public_netmask_bits,
401                             ctdb_vnn_iface_string(vnn)));
402
403         ret = ctdb_event_script_callback(ctdb,
404                                          state,
405                                          ctdb_do_takeip_callback,
406                                          state,
407                                          false,
408                                          CTDB_EVENT_TAKE_IP,
409                                          "%s %s %u",
410                                          ctdb_vnn_iface_string(vnn),
411                                          ctdb_addr_to_str(&vnn->public_address),
412                                          vnn->public_netmask_bits);
413
414         if (ret != 0) {
415                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
416                         ctdb_addr_to_str(&vnn->public_address),
417                         ctdb_vnn_iface_string(vnn)));
418                 talloc_free(state);
419                 return -1;
420         }
421
422         return 0;
423 }
424
/*
  state handed from ctdb_do_updateip() to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
	/* the control request that gets the reply */
	struct ctdb_req_control *c;
	/* interface the address lived on before the move */
	struct ctdb_iface       *old;
	/* public address entry being moved */
	struct ctdb_vnn         *vnn;
};
430
431 /*
432   called when updateip event finishes
433  */
434 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
435                                       void *private_data)
436 {
437         struct ctdb_do_updateip_state *state =
438                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
439         int32_t ret;
440
441         if (status != 0) {
442                 if (status == -ETIME) {
443                         ctdb_ban_self(ctdb);
444                 }
445                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
446                         ctdb_addr_to_str(&state->vnn->public_address),
447                         state->old->name,
448                         ctdb_vnn_iface_string(state->vnn)));
449
450                 /*
451                  * All we can do is reset the old interface
452                  * and let the next run fix it
453                  */
454                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
455                 state->vnn->iface = state->old;
456                 state->vnn->iface->references++;
457
458                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
459                 talloc_free(state);
460                 return;
461         }
462
463         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
464         if (ret != 0) {
465                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
466                 talloc_free(state);
467                 return;
468         }
469
470         /* the control succeeded */
471         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
472         talloc_free(state);
473         return;
474 }
475
476 /*
477   update (move) an ip address
478  */
479 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
480                                 struct ctdb_req_control *c,
481                                 struct ctdb_vnn *vnn)
482 {
483         int ret;
484         struct ctdb_do_updateip_state *state;
485         struct ctdb_iface *old = vnn->iface;
486         char *new_name;
487
488         ctdb_vnn_unassign_iface(ctdb, vnn);
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
492                                  "assin a usable interface (old iface '%s')\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits,
495                                  old->name));
496                 return -1;
497         }
498
499         new_name = ctdb_vnn_iface_string(vnn);
500         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
501                 /* A benign update from one interface onto itself.
502                  * no need to run the eventscripts in this case, just return
503                  * success.
504                  */
505                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
506                 return 0;
507         }
508
509         state = talloc(vnn, struct ctdb_do_updateip_state);
510         CTDB_NO_MEMORY(ctdb, state);
511
512         state->c = talloc_steal(ctdb, c);
513         state->old = old;
514         state->vnn = vnn;
515
516         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
517                             "interface %s to %s\n",
518                             ctdb_addr_to_str(&vnn->public_address),
519                             vnn->public_netmask_bits,
520                             old->name,
521                             new_name));
522
523         ret = ctdb_event_script_callback(ctdb,
524                                          state,
525                                          ctdb_do_updateip_callback,
526                                          state,
527                                          false,
528                                          CTDB_EVENT_UPDATE_IP,
529                                          "%s %s %s %u",
530                                          state->old->name,
531                                          new_name,
532                                          ctdb_addr_to_str(&vnn->public_address),
533                                          vnn->public_netmask_bits);
534         if (ret != 0) {
535                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
536                                  ctdb_addr_to_str(&vnn->public_address),
537                                  old->name, new_name));
538                 talloc_free(state);
539                 return -1;
540         }
541
542         return 0;
543 }
544
545 /*
546   Find the vnn of the node that has a public ip address
547   returns -1 if the address is not known as a public address
548  */
549 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
550 {
551         struct ctdb_vnn *vnn;
552
553         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
554                 if (ctdb_same_ip(&vnn->public_address, addr)) {
555                         return vnn;
556                 }
557         }
558
559         return NULL;
560 }
561
562 /*
563   take over an ip address
564  */
565 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
566                                  struct ctdb_req_control *c,
567                                  TDB_DATA indata,
568                                  bool *async_reply)
569 {
570         int ret;
571         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
572         struct ctdb_vnn *vnn;
573         bool have_ip = false;
574         bool do_updateip = false;
575         bool do_takeip = false;
576         struct ctdb_iface *best_iface = NULL;
577
578         if (pip->pnn != ctdb->pnn) {
579                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
580                                  "with pnn %d, but we're node %d\n",
581                                  ctdb_addr_to_str(&pip->addr),
582                                  pip->pnn, ctdb->pnn));
583                 return -1;
584         }
585
586         /* update out vnn list */
587         vnn = find_public_ip_vnn(ctdb, &pip->addr);
588         if (vnn == NULL) {
589                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
590                         ctdb_addr_to_str(&pip->addr)));
591                 return 0;
592         }
593
594         have_ip = ctdb_sys_have_ip(&pip->addr);
595         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
596         if (best_iface == NULL) {
597                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
598                                  "a usable interface (old %s, have_ip %d)\n",
599                                  ctdb_addr_to_str(&vnn->public_address),
600                                  vnn->public_netmask_bits,
601                                  ctdb_vnn_iface_string(vnn),
602                                  have_ip));
603                 return -1;
604         }
605
606         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
607                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
608                 have_ip = false;
609         }
610
611         if (vnn->iface == NULL && have_ip) {
612                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
613                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
614                                  ctdb_addr_to_str(&vnn->public_address)));
615                 return 0;
616         }
617
618         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
619                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
620                                   "and we have it on iface[%s], but it was assigned to node %d"
621                                   "and we are node %d, banning ourself\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
624                 ctdb_ban_self(ctdb);
625                 return -1;
626         }
627
628         if (vnn->pnn == -1 && have_ip) {
629                 vnn->pnn = ctdb->pnn;
630                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
631                                   "and we already have it on iface[%s], update local daemon\n",
632                                  ctdb_addr_to_str(&vnn->public_address),
633                                   ctdb_vnn_iface_string(vnn)));
634                 return 0;
635         }
636
637         if (vnn->iface) {
638                 if (vnn->iface->link_up) {
639                         /* only move when the rebalance gains something */
640                         if (vnn->iface->references > (best_iface->references + 1)) {
641                                 do_updateip = true;
642                         }
643                 } else if (vnn->iface != best_iface) {
644                         do_updateip = true;
645                 }
646         }
647
648         if (!have_ip) {
649                 if (do_updateip) {
650                         ctdb_vnn_unassign_iface(ctdb, vnn);
651                         do_updateip = false;
652                 }
653                 do_takeip = true;
654         }
655
656         if (do_takeip) {
657                 ret = ctdb_do_takeip(ctdb, c, vnn);
658                 if (ret != 0) {
659                         return -1;
660                 }
661         } else if (do_updateip) {
662                 ret = ctdb_do_updateip(ctdb, c, vnn);
663                 if (ret != 0) {
664                         return -1;
665                 }
666         } else {
667                 /*
668                  * The interface is up and the kernel known the ip
669                  * => do nothing
670                  */
671                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
672                         ctdb_addr_to_str(&pip->addr),
673                         vnn->public_netmask_bits,
674                         ctdb_vnn_iface_string(vnn)));
675                 return 0;
676         }
677
678         /* tell ctdb_control.c that we will be replying asynchronously */
679         *async_reply = true;
680
681         return 0;
682 }
683
684 /*
685   takeover an ip address old v4 style
686  */
687 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
688                                 struct ctdb_req_control *c,
689                                 TDB_DATA indata, 
690                                 bool *async_reply)
691 {
692         TDB_DATA data;
693         
694         data.dsize = sizeof(struct ctdb_public_ip);
695         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
696         CTDB_NO_MEMORY(ctdb, data.dptr);
697         
698         memcpy(data.dptr, indata.dptr, indata.dsize);
699         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
700 }
701
702 /*
703   kill any clients that are registered with a IP that is being released
704  */
705 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
706 {
707         struct ctdb_client_ip *ip;
708
709         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
710                 ctdb_addr_to_str(addr)));
711
712         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
713                 ctdb_sock_addr tmp_addr;
714
715                 tmp_addr = ip->addr;
716                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
717                         ip->client_id,
718                         ctdb_addr_to_str(&ip->addr)));
719
720                 if (ctdb_same_ip(&tmp_addr, addr)) {
721                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
722                                                                      ip->client_id, 
723                                                                      struct ctdb_client);
724                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
725                                 ip->client_id,
726                                 ctdb_addr_to_str(&ip->addr),
727                                 client->pid));
728
729                         if (client->pid != 0) {
730                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
731                                         (unsigned)client->pid,
732                                         ctdb_addr_to_str(addr),
733                                         ip->client_id));
734                                 kill(client->pid, SIGKILL);
735                         }
736                 }
737         }
738 }
739
740 /*
741   called when releaseip event finishes
742  */
743 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
744                                 void *private_data)
745 {
746         struct takeover_callback_state *state = 
747                 talloc_get_type(private_data, struct takeover_callback_state);
748         TDB_DATA data;
749
750         if (status == -ETIME) {
751                 ctdb_ban_self(ctdb);
752         }
753
754         /* send a message to all clients of this node telling them
755            that the cluster has been reconfigured and they should
756            release any sockets on this IP */
757         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
758         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
759         data.dsize = strlen((char *)data.dptr)+1;
760
761         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
762
763         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
764
765         /* kill clients that have registered with this IP */
766         release_kill_clients(ctdb, state->addr);
767
768         ctdb_vnn_unassign_iface(ctdb, state->vnn);
769
770         /* the control succeeded */
771         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
772         talloc_free(state);
773 }
774
775 /*
776   release an ip address
777  */
778 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
779                                 struct ctdb_req_control *c,
780                                 TDB_DATA indata, 
781                                 bool *async_reply)
782 {
783         int ret;
784         struct takeover_callback_state *state;
785         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
786         struct ctdb_vnn *vnn;
787
788         /* update our vnn list */
789         vnn = find_public_ip_vnn(ctdb, &pip->addr);
790         if (vnn == NULL) {
791                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
792                         ctdb_addr_to_str(&pip->addr)));
793                 return 0;
794         }
795         vnn->pnn = pip->pnn;
796
797         /* stop any previous arps */
798         talloc_free(vnn->takeover_ctx);
799         vnn->takeover_ctx = NULL;
800
801         if (!ctdb_sys_have_ip(&pip->addr)) {
802                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits, 
805                         ctdb_vnn_iface_string(vnn)));
806                 ctdb_vnn_unassign_iface(ctdb, vnn);
807                 return 0;
808         }
809
810         if (vnn->iface == NULL) {
811                 DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
812                                  "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
813                                  ctdb_addr_to_str(&vnn->public_address)));
814                 return 0;
815         }
816
817         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
818                 ctdb_addr_to_str(&pip->addr),
819                 vnn->public_netmask_bits, 
820                 ctdb_vnn_iface_string(vnn),
821                 pip->pnn));
822
823         state = talloc(ctdb, struct takeover_callback_state);
824         CTDB_NO_MEMORY(ctdb, state);
825
826         state->c = talloc_steal(state, c);
827         state->addr = talloc(state, ctdb_sock_addr);       
828         CTDB_NO_MEMORY(ctdb, state->addr);
829         *state->addr = pip->addr;
830         state->vnn   = vnn;
831
832         ret = ctdb_event_script_callback(ctdb, 
833                                          state, release_ip_callback, state,
834                                          false,
835                                          CTDB_EVENT_RELEASE_IP,
836                                          "%s %s %u",
837                                          ctdb_vnn_iface_string(vnn),
838                                          ctdb_addr_to_str(&pip->addr),
839                                          vnn->public_netmask_bits);
840         if (ret != 0) {
841                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
842                         ctdb_addr_to_str(&pip->addr),
843                         ctdb_vnn_iface_string(vnn)));
844                 talloc_free(state);
845                 return -1;
846         }
847
848         /* tell the control that we will be reply asynchronously */
849         *async_reply = true;
850         return 0;
851 }
852
853 /*
854   release an ip address old v4 style
855  */
856 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
857                                 struct ctdb_req_control *c,
858                                 TDB_DATA indata, 
859                                 bool *async_reply)
860 {
861         TDB_DATA data;
862         
863         data.dsize = sizeof(struct ctdb_public_ip);
864         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
865         CTDB_NO_MEMORY(ctdb, data.dptr);
866         
867         memcpy(data.dptr, indata.dptr, indata.dsize);
868         return ctdb_control_release_ip(ctdb, c, data, async_reply);
869 }
870
871
872 static int ctdb_add_public_address(struct ctdb_context *ctdb,
873                                    ctdb_sock_addr *addr,
874                                    unsigned mask, const char *ifaces)
875 {
876         struct ctdb_vnn      *vnn;
877         uint32_t num = 0;
878         char *tmp;
879         const char *iface;
880         int i;
881         int ret;
882
883         /* Verify that we dont have an entry for this ip yet */
884         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
885                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
886                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
887                                 ctdb_addr_to_str(addr)));
888                         return -1;
889                 }               
890         }
891
892         /* create a new vnn structure for this ip address */
893         vnn = talloc_zero(ctdb, struct ctdb_vnn);
894         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
895         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
896         tmp = talloc_strdup(vnn, ifaces);
897         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
898         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
899                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
900                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
901                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
902                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
903                 num++;
904         }
905         talloc_free(tmp);
906         vnn->ifaces[num] = NULL;
907         vnn->public_address      = *addr;
908         vnn->public_netmask_bits = mask;
909         vnn->pnn                 = -1;
910         if (ctdb_sys_have_ip(addr)) {
911                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
912                 vnn->pnn = ctdb->pnn;
913         }
914
915         for (i=0; vnn->ifaces[i]; i++) {
916                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
917                 if (ret != 0) {
918                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
919                                            "for public_address[%s]\n",
920                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
921                         talloc_free(vnn);
922                         return -1;
923                 }
924                 if (i == 0) {
925                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
926                 }
927         }
928
929         DLIST_ADD(ctdb->vnn, vnn);
930
931         return 0;
932 }
933
934 /*
935   setup the event script directory
936 */
937 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
938 {
939         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
940         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
941         return 0;
942 }
943
944 /*
945   setup the public address lists from a file
946 */
947 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
948 {
949         char **lines;
950         int nlines;
951         int i;
952
953         lines = file_lines_load(alist, &nlines, ctdb);
954         if (lines == NULL) {
955                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
956                 return -1;
957         }
958         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
959                 nlines--;
960         }
961
962         for (i=0;i<nlines;i++) {
963                 unsigned mask;
964                 ctdb_sock_addr addr;
965                 const char *addrstr;
966                 const char *ifaces;
967                 char *tok, *line;
968
969                 line = lines[i];
970                 while ((*line == ' ') || (*line == '\t')) {
971                         line++;
972                 }
973                 if (*line == '#') {
974                         continue;
975                 }
976                 if (strcmp(line, "") == 0) {
977                         continue;
978                 }
979                 tok = strtok(line, " \t");
980                 addrstr = tok;
981                 tok = strtok(NULL, " \t");
982                 if (tok == NULL) {
983                         if (NULL == ctdb->default_public_interface) {
984                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
985                                          i+1));
986                                 talloc_free(lines);
987                                 return -1;
988                         }
989                         ifaces = ctdb->default_public_interface;
990                 } else {
991                         ifaces = tok;
992                 }
993
994                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
995                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
996                         talloc_free(lines);
997                         return -1;
998                 }
999                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
1000                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1001                         talloc_free(lines);
1002                         return -1;
1003                 }
1004         }
1005
1006         talloc_free(lines);
1007         return 0;
1008 }
1009
1010 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1011                               const char *iface,
1012                               const char *ip)
1013 {
1014         struct ctdb_vnn *svnn;
1015         struct ctdb_iface *cur = NULL;
1016         bool ok;
1017         int ret;
1018
1019         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1020         CTDB_NO_MEMORY(ctdb, svnn);
1021
1022         svnn->ifaces = talloc_array(svnn, const char *, 2);
1023         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1024         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1025         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1026         svnn->ifaces[1] = NULL;
1027
1028         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1029         if (!ok) {
1030                 talloc_free(svnn);
1031                 return -1;
1032         }
1033
1034         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1035         if (ret != 0) {
1036                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1037                                    "for single_ip[%s]\n",
1038                                    svnn->ifaces[0],
1039                                    ctdb_addr_to_str(&svnn->public_address)));
1040                 talloc_free(svnn);
1041                 return -1;
1042         }
1043
1044         /* assume the single public ip interface is initially "good" */
1045         cur = ctdb_find_iface(ctdb, iface);
1046         if (cur == NULL) {
1047                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1048                 return -1;
1049         }
1050         cur->link_up = true;
1051
1052         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1053         if (ret != 0) {
1054                 talloc_free(svnn);
1055                 return -1;
1056         }
1057
1058         ctdb->single_ip_vnn = svnn;
1059         return 0;
1060 }
1061
1062 struct ctdb_public_ip_list {
1063         struct ctdb_public_ip_list *next;
1064         uint32_t pnn;
1065         ctdb_sock_addr addr;
1066 };
1067
1068
1069 /* Given a physical node, return the number of
1070    public addresses that is currently assigned to this node.
1071 */
1072 static int node_ip_coverage(struct ctdb_context *ctdb, 
1073         int32_t pnn,
1074         struct ctdb_public_ip_list *ips)
1075 {
1076         int num=0;
1077
1078         for (;ips;ips=ips->next) {
1079                 if (ips->pnn == pnn) {
1080                         num++;
1081                 }
1082         }
1083         return num;
1084 }
1085
1086
1087 /* Check if this is a public ip known to the node, i.e. can that
1088    node takeover this ip ?
1089 */
1090 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1091                 struct ctdb_public_ip_list *ip)
1092 {
1093         struct ctdb_all_public_ips *public_ips;
1094         int i;
1095
1096         public_ips = ctdb->nodes[pnn]->available_public_ips;
1097
1098         if (public_ips == NULL) {
1099                 return -1;
1100         }
1101
1102         for (i=0;i<public_ips->num;i++) {
1103                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1104                         /* yes, this node can serve this public ip */
1105                         return 0;
1106                 }
1107         }
1108
1109         return -1;
1110 }
1111
1112
1113 /* search the node lists list for a node to takeover this ip.
1114    pick the node that currently are serving the least number of ips
1115    so that the ips get spread out evenly.
1116 */
1117 static int find_takeover_node(struct ctdb_context *ctdb, 
1118                 struct ctdb_node_map *nodemap, uint32_t mask, 
1119                 struct ctdb_public_ip_list *ip,
1120                 struct ctdb_public_ip_list *all_ips)
1121 {
1122         int pnn, min=0, num;
1123         int i;
1124
1125         pnn    = -1;
1126         for (i=0;i<nodemap->num;i++) {
1127                 if (nodemap->nodes[i].flags & mask) {
1128                         /* This node is not healty and can not be used to serve
1129                            a public address 
1130                         */
1131                         continue;
1132                 }
1133
1134                 /* verify that this node can serve this ip */
1135                 if (can_node_serve_ip(ctdb, i, ip)) {
1136                         /* no it couldnt   so skip to the next node */
1137                         continue;
1138                 }
1139
1140                 num = node_ip_coverage(ctdb, i, all_ips);
1141                 /* was this the first node we checked ? */
1142                 if (pnn == -1) {
1143                         pnn = i;
1144                         min  = num;
1145                 } else {
1146                         if (num < min) {
1147                                 pnn = i;
1148                                 min  = num;
1149                         }
1150                 }
1151         }       
1152         if (pnn == -1) {
1153                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1154                         ctdb_addr_to_str(&ip->addr)));
1155
1156                 return -1;
1157         }
1158
1159         ip->pnn = pnn;
1160         return 0;
1161 }
1162
1163 #define IP_KEYLEN       4
1164 static uint32_t *ip_key(ctdb_sock_addr *ip)
1165 {
1166         static uint32_t key[IP_KEYLEN];
1167
1168         bzero(key, sizeof(key));
1169
1170         switch (ip->sa.sa_family) {
1171         case AF_INET:
1172                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1173                 break;
1174         case AF_INET6:
1175                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
1176                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
1177                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
1178                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
1179                 break;
1180         default:
1181                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1182                 return key;
1183         }
1184
1185         return key;
1186 }
1187
1188 static void *add_ip_callback(void *parm, void *data)
1189 {
1190         struct ctdb_public_ip_list *this_ip = parm; 
1191         struct ctdb_public_ip_list *prev_ip = data; 
1192
1193         if (prev_ip == NULL) {
1194                 return parm;
1195         }
1196         if (this_ip->pnn == -1) {
1197                 this_ip->pnn = prev_ip->pnn;
1198         }
1199
1200         return parm;
1201 }
1202
1203 void getips_count_callback(void *param, void *data)
1204 {
1205         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1206         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1207
1208         new_ip->next = *ip_list;
1209         *ip_list     = new_ip;
1210 }
1211
1212 static struct ctdb_public_ip_list *
1213 create_merged_ip_list(struct ctdb_context *ctdb)
1214 {
1215         int i, j;
1216         struct ctdb_public_ip_list *ip_list;
1217         struct ctdb_all_public_ips *public_ips;
1218
1219         if (ctdb->ip_tree != NULL) {
1220                 talloc_free(ctdb->ip_tree);
1221                 ctdb->ip_tree = NULL;
1222         }
1223         ctdb->ip_tree = trbt_create(ctdb, 0);
1224
1225         for (i=0;i<ctdb->num_nodes;i++) {
1226                 public_ips = ctdb->nodes[i]->known_public_ips;
1227
1228                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1229                         continue;
1230                 }
1231
1232                 /* there were no public ips for this node */
1233                 if (public_ips == NULL) {
1234                         continue;
1235                 }               
1236
1237                 for (j=0;j<public_ips->num;j++) {
1238                         struct ctdb_public_ip_list *tmp_ip; 
1239
1240                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1241                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1242                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1243                         tmp_ip->addr = public_ips->ips[j].addr;
1244                         tmp_ip->next = NULL;
1245
1246                         trbt_insertarray32_callback(ctdb->ip_tree,
1247                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1248                                 add_ip_callback,
1249                                 tmp_ip);
1250                 }
1251         }
1252
1253         ip_list = NULL;
1254         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1255
1256         return ip_list;
1257 }
1258
1259 /* 
1260  * This is the length of the longtest common prefix between the IPs.
1261  * It is calculated by XOR-ing the 2 IPs together and counting the
1262  * number of leading zeroes.  The implementation means that all
1263  * addresses end up being 128 bits long.
1264  * Not static, so we can easily link it into a unit test.
1265  *
1266  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1267  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1268  * lots of nodes and IP addresses?
1269  */
1270 uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1271 {
1272         uint32_t ip1_k[IP_KEYLEN];
1273         uint32_t *t;
1274         int i;
1275         uint32_t x;
1276
1277         uint32_t distance = 0;
1278
1279         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1280         t = ip_key(ip2);
1281         for (i=0; i<IP_KEYLEN; i++) {
1282                 x = ip1_k[i] ^ t[i];
1283                 if (x == 0) {
1284                         distance += 32;
1285                 } else {
1286                         /* Count number of leading zeroes. 
1287                          * FIXME? This could be optimised...
1288                          */
1289                         while ((x & (1 << 31)) == 0) {
1290                                 x <<= 1;
1291                                 distance += 1;
1292                         }
1293                 }
1294         }
1295
1296         return distance;
1297 }
1298
1299 /* Calculate the IP distance for the given IP relative to IPs on the
1300    given node.  The ips argument is generally the all_ips variable
1301    used in the main part of the algorithm.
1302  * Not static, so we can easily link it into a unit test.
1303  */
1304 uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1305                            struct ctdb_public_ip_list *ips,
1306                            int pnn)
1307 {
1308         struct ctdb_public_ip_list *t;
1309         uint32_t d;
1310
1311         uint32_t sum = 0;
1312
1313         for (t=ips; t != NULL; t=t->next) {
1314                 if (t->pnn != pnn) {
1315                         continue;
1316                 }
1317
1318                 /* Optimisation: We never calculate the distance
1319                  * between an address and itself.  This allows us to
1320                  * calculate the effect of removing an address from a
1321                  * node by simply calculating the distance between
1322                  * that address and all of the exitsing addresses.
1323                  * Moreover, we assume that we're only ever dealing
1324                  * with addresses from all_ips so we can identify an
1325                  * address via a pointer rather than doing a more
1326                  * expensive address comparison. */
1327                 if (&(t->addr) == ip) {
1328                         continue;
1329                 }
1330
1331                 d = ip_distance(ip, &(t->addr));
1332                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1333         }
1334
1335         return sum;
1336 }
1337
1338 /* Return the LCP2 imbalance metric for addresses currently assigned
1339    to the given node.
1340  * Not static, so we can easily link it into a unit test.
1341  */
1342 uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1343 {
1344         struct ctdb_public_ip_list *t;
1345
1346         uint32_t imbalance = 0;
1347
1348         for (t=all_ips; t!=NULL; t=t->next) {
1349                 if (t->pnn != pnn) {
1350                         continue;
1351                 }
1352                 /* Pass the rest of the IPs rather than the whole
1353                    all_ips input list.
1354                 */
1355                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1356         }
1357
1358         return imbalance;
1359 }
1360
1361 /* Allocate any unassigned IPs just by looping through the IPs and
1362  * finding the best node for each.
1363  * Not static, so we can easily link it into a unit test.
1364  */
1365 void basic_allocate_unassigned(struct ctdb_context *ctdb,
1366                                struct ctdb_node_map *nodemap,
1367                                uint32_t mask,
1368                                struct ctdb_public_ip_list *all_ips)
1369 {
1370         struct ctdb_public_ip_list *tmp_ip;
1371
1372         /* loop over all ip's and find a physical node to cover for 
1373            each unassigned ip.
1374         */
1375         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1376                 if (tmp_ip->pnn == -1) {
1377                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1378                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1379                                         ctdb_addr_to_str(&tmp_ip->addr)));
1380                         }
1381                 }
1382         }
1383 }
1384
1385 /* Basic non-deterministic rebalancing algorithm.
1386  * Not static, so we can easily link it into a unit test.
1387  */
1388 bool basic_failback(struct ctdb_context *ctdb,
1389                     struct ctdb_node_map *nodemap,
1390                     uint32_t mask,
1391                     struct ctdb_public_ip_list *all_ips,
1392                     int num_ips,
1393                     int *retries)
1394 {
1395         int i;
1396         int maxnode, maxnum=0, minnode, minnum=0, num;
1397         struct ctdb_public_ip_list *tmp_ip;
1398
1399         /* for each ip address, loop over all nodes that can serve
1400            this ip and make sure that the difference between the node
1401            serving the most and the node serving the least ip's are
1402            not greater than 1.
1403         */
1404         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1405                 if (tmp_ip->pnn == -1) {
1406                         continue;
1407                 }
1408
1409                 /* Get the highest and lowest number of ips's served by any 
1410                    valid node which can serve this ip.
1411                 */
1412                 maxnode = -1;
1413                 minnode = -1;
1414                 for (i=0;i<nodemap->num;i++) {
1415                         if (nodemap->nodes[i].flags & mask) {
1416                                 continue;
1417                         }
1418
1419                         /* only check nodes that can actually serve this ip */
1420                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1421                                 /* no it couldnt   so skip to the next node */
1422                                 continue;
1423                         }
1424
1425                         num = node_ip_coverage(ctdb, i, all_ips);
1426                         if (maxnode == -1) {
1427                                 maxnode = i;
1428                                 maxnum  = num;
1429                         } else {
1430                                 if (num > maxnum) {
1431                                         maxnode = i;
1432                                         maxnum  = num;
1433                                 }
1434                         }
1435                         if (minnode == -1) {
1436                                 minnode = i;
1437                                 minnum  = num;
1438                         } else {
1439                                 if (num < minnum) {
1440                                         minnode = i;
1441                                         minnum  = num;
1442                                 }
1443                         }
1444                 }
1445                 if (maxnode == -1) {
1446                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1447                                 ctdb_addr_to_str(&tmp_ip->addr)));
1448
1449                         continue;
1450                 }
1451
1452                 /* If we want deterministic IPs then dont try to reallocate 
1453                    them to spread out the load.
1454                 */
1455                 if (1 == ctdb->tunable.deterministic_public_ips) {
1456                         continue;
1457                 }
1458
1459                 /* if the spread between the smallest and largest coverage by
1460                    a node is >=2 we steal one of the ips from the node with
1461                    most coverage to even things out a bit.
1462                    try to do this a limited number of times since we dont
1463                    want to spend too much time balancing the ip coverage.
1464                 */
1465                 if ( (maxnum > minnum+1)
1466                      && (*retries < (num_ips + 5)) ){
1467                         struct ctdb_public_ip_list *tmp;
1468
1469                         /* mark one of maxnode's vnn's as unassigned and try
1470                            again
1471                         */
1472                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1473                                 if (tmp->pnn == maxnode) {
1474                                         tmp->pnn = -1;
1475                                         (*retries)++;
1476                                         return true;
1477                                 }
1478                         }
1479                 }
1480         }
1481
1482         return false;
1483 }
1484
1485 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1486  * that we can unit test it.
1487  * Not static, so we can easily link it into a unit test.
1488  */
1489 void lcp2_init(struct ctdb_context * tmp_ctx,
1490                struct ctdb_node_map * nodemap,
1491                uint32_t mask,
1492                struct ctdb_public_ip_list *all_ips,
1493                uint32_t **lcp2_imbalances,
1494                bool **newly_healthy)
1495 {
1496         int i;
1497         struct ctdb_public_ip_list *tmp_ip;
1498
1499         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1500         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1501         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1502         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1503
1504         for (i=0;i<nodemap->num;i++) {
1505                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1506                 /* First step: is the node "healthy"? */
1507                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1508         }
1509
1510         /* 2nd step: if a ndoe has IPs assigned then it must have been
1511          * healthy before, so we remove it from consideration... */
1512         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1513                 if (tmp_ip->pnn != -1) {
1514                         (*newly_healthy)[tmp_ip->pnn] = false;
1515                 }
1516         }
1517 }
1518
1519 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1520  * the IP/node combination that will cost the least.
1521  * Not static, so we can easily link it into a unit test.
1522  */
1523 void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1524                               struct ctdb_node_map *nodemap,
1525                               uint32_t mask,
1526                               struct ctdb_public_ip_list *all_ips,
1527                               uint32_t *lcp2_imbalances)
1528 {
1529         struct ctdb_public_ip_list *tmp_ip;
1530         int dstnode;
1531
1532         int minnode;
1533         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1534         struct ctdb_public_ip_list *minip;
1535
1536         bool should_loop = true;
1537         bool have_unassigned = true;
1538
1539         while (have_unassigned && should_loop) {
1540                 should_loop = false;
1541
1542                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1543                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1544
1545                 minnode = -1;
1546                 mindsum = 0;
1547                 minip = NULL;
1548
1549                 /* loop over each unassigned ip. */
1550                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1551                         if (tmp_ip->pnn != -1) {
1552                                 continue;
1553                         }
1554
1555                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1556                                 /* only check nodes that can actually serve this ip */
1557                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1558                                         /* no it couldnt   so skip to the next node */
1559                                         continue;
1560                                 }
1561                                 if (nodemap->nodes[dstnode].flags & mask) {
1562                                         continue;
1563                                 }
1564
1565                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1566                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1567                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1568                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1569                                                    dstnode,
1570                                                    dstimbl - lcp2_imbalances[dstnode]));
1571
1572
1573                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1574                                         minnode = dstnode;
1575                                         minimbl = dstimbl;
1576                                         mindsum = dstdsum;
1577                                         minip = tmp_ip;
1578                                         should_loop = true;
1579                                 }
1580                         }
1581                 }
1582
1583                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1584
1585                 /* If we found one then assign it to the given node. */
1586                 if (minnode != -1) {
1587                         minip->pnn = minnode;
1588                         lcp2_imbalances[minnode] = minimbl;
1589                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1590                                           ctdb_addr_to_str(&(minip->addr)),
1591                                           minnode,
1592                                           mindsum));
1593                 }
1594
1595                 /* There might be a better way but at least this is clear. */
1596                 have_unassigned = false;
1597                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1598                         if (tmp_ip->pnn == -1) {
1599                                 have_unassigned = true;
1600                         }
1601                 }
1602         }
1603
1604         /* We know if we have an unassigned addresses so we might as
1605          * well optimise.
1606          */
1607         if (have_unassigned) {
1608                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1609                         if (tmp_ip->pnn == -1) {
1610                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1611                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1612                         }
1613                 }
1614         }
1615 }
1616
1617 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1618  * node with the highest LCP2 imbalance, and then determines the best
1619  * IP/destination node combination to move from the source node.
1620  *
1621  * Not static, so we can easily link it into a unit test.
1622  */
1623 bool lcp2_failback(struct ctdb_context *ctdb,
1624                    struct ctdb_node_map *nodemap,
1625                    uint32_t mask,
1626                    struct ctdb_public_ip_list *all_ips,
1627                    uint32_t *lcp2_imbalances,
1628                    bool *newly_healthy)
1629 {
1630         int srcnode, dstnode, mindstnode, i, num_newly_healthy;
1631         uint32_t srcimbl, srcdsum, maximbl, dstimbl, dstdsum;
1632         uint32_t minsrcimbl, mindstimbl, b;
1633         struct ctdb_public_ip_list *minip;
1634         struct ctdb_public_ip_list *tmp_ip;
1635
1636         /* It is only worth continuing if we have suitable target
1637          * nodes to transfer IPs to.  This check is much cheaper than
1638          * continuing on...
1639          */
1640         num_newly_healthy = 0;
1641         for (i = 0; i < nodemap->num; i++) {
1642                 if (newly_healthy[i]) {
1643                         num_newly_healthy++;
1644                 }
1645         }
1646         if (num_newly_healthy == 0) {
1647                 return false;
1648         }
1649
1650         /* Get the node with the highest imbalance metric. */
1651         srcnode = -1;
1652         maximbl = 0;
1653         for (i=0; i < nodemap->num; i++) {
1654                 b = lcp2_imbalances[i];
1655                 if ((srcnode == -1) || (b > maximbl)) {
1656                         srcnode = i;
1657                         maximbl = b;
1658                 }
1659         }
1660
1661         /* This means that all nodes had 0 or 1 addresses, so can't be
1662          * imbalanced.
1663          */
1664         if (maximbl == 0) {
1665                 return false;
1666         }
1667
1668         /* Find an IP and destination node that best reduces imbalance. */
1669         minip = NULL;
1670         minsrcimbl = 0;
1671         mindstnode = -1;
1672         mindstimbl = 0;
1673
1674         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1675         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, maximbl));
1676
1677         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1678                 /* Only consider addresses on srcnode. */
1679                 if (tmp_ip->pnn != srcnode) {
1680                         continue;
1681                 }
1682
1683                 /* What is this IP address costing the source node? */
1684                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1685                 srcimbl = maximbl - srcdsum;
1686
1687                 /* Consider this IP address would cost each potential
1688                  * destination node.  Destination nodes are limited to
1689                  * those that are newly healthy, since we don't want
1690                  * to do gratuitous failover of IPs just to make minor
1691                  * balance improvements.
1692                  */
1693                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1694                         if (! newly_healthy[dstnode]) {
1695                                 continue;
1696                         }
1697                         /* only check nodes that can actually serve this ip */
1698                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1699                                 /* no it couldnt   so skip to the next node */
1700                                 continue;
1701                         }
1702
1703                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1704                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1705                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1706                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1707                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1708                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1709
1710                         if ((dstimbl < maximbl) && (dstdsum < srcdsum) && \
1711                             ((mindstnode == -1) ||                              \
1712                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1713
1714                                 minip = tmp_ip;
1715                                 minsrcimbl = srcimbl;
1716                                 mindstnode = dstnode;
1717                                 mindstimbl = dstimbl;
1718                         }
1719                 }
1720         }
1721         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1722
1723         if (mindstnode != -1) {
1724                 /* We found a move that makes things better... */
1725                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1726                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1727                                   ctdb_addr_to_str(&(minip->addr)),
1728                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1729
1730
1731                 lcp2_imbalances[srcnode] = srcimbl;
1732                 lcp2_imbalances[mindstnode] = mindstimbl;
1733                 minip->pnn = mindstnode;
1734
1735                 return true;
1736         }
1737
1738         return false;
1739         
1740 }
1741
1742 /* The calculation part of the IP allocation algorithm.
1743  * Not static, so we can easily link it into a unit test.
1744  */
1745 void ctdb_takeover_run_core(struct ctdb_context *ctdb,
1746                             struct ctdb_node_map *nodemap,
1747                             struct ctdb_public_ip_list **all_ips_p)
1748 {
1749         int i, num_healthy, retries, num_ips;
1750         uint32_t mask;
1751         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1752         uint32_t *lcp2_imbalances;
1753         bool *newly_healthy;
1754
1755         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1756
1757         /* Count how many completely healthy nodes we have */
1758         num_healthy = 0;
1759         for (i=0;i<nodemap->num;i++) {
1760                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1761                         num_healthy++;
1762                 }
1763         }
1764
1765         if (num_healthy > 0) {
1766                 /* We have healthy nodes, so only consider them for 
1767                    serving public addresses
1768                 */
1769                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1770         } else {
1771                 /* We didnt have any completely healthy nodes so
1772                    use "disabled" nodes as a fallback
1773                 */
1774                 mask = NODE_FLAGS_INACTIVE;
1775         }
1776
1777         /* since nodes only know about those public addresses that
1778            can be served by that particular node, no single node has
1779            a full list of all public addresses that exist in the cluster.
1780            Walk over all node structures and create a merged list of
1781            all public addresses that exist in the cluster.
1782
1783            keep the tree of ips around as ctdb->ip_tree
1784         */
1785         all_ips = create_merged_ip_list(ctdb);
1786         *all_ips_p = all_ips; /* minimal code changes */
1787
1788         /* Count how many ips we have */
1789         num_ips = 0;
1790         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1791                 num_ips++;
1792         }
1793
1794         /* If we want deterministic ip allocations, i.e. that the ip addresses
1795            will always be allocated the same way for a specific set of
1796            available/unavailable nodes.
1797         */
1798         if (1 == ctdb->tunable.deterministic_public_ips) {              
1799                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1800                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1801                         tmp_ip->pnn = i%nodemap->num;
1802                 }
1803         }
1804
1805
1806         /* mark all public addresses with a masked node as being served by
1807            node -1
1808         */
1809         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1810                 if (tmp_ip->pnn == -1) {
1811                         continue;
1812                 }
1813                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1814                         tmp_ip->pnn = -1;
1815                 }
1816         }
1817
1818         /* verify that the assigned nodes can serve that public ip
1819            and set it to -1 if not
1820         */
1821         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1822                 if (tmp_ip->pnn == -1) {
1823                         continue;
1824                 }
1825                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1826                         /* this node can not serve this ip. */
1827                         tmp_ip->pnn = -1;
1828                 }
1829         }
1830
1831         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1832                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
1833         }
1834
1835         /* now we must redistribute all public addresses with takeover node
1836            -1 among the nodes available
1837         */
1838         retries = 0;
1839 try_again:
1840         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1841                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
1842         } else {
1843                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
1844         }
1845
1846         /* If we dont want ips to fail back after a node becomes healthy
1847            again, we wont even try to reallocat the ip addresses so that
1848            they are evenly spread out.
1849            This can NOT be used at the same time as DeterministicIPs !
1850         */
1851         if (1 == ctdb->tunable.no_ip_failback) {
1852                 if (1 == ctdb->tunable.deterministic_public_ips) {
1853                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1854                 }
1855                 goto finished;
1856         }
1857
1858
1859         /* now, try to make sure the ip adresses are evenly distributed
1860            across the node.
1861         */
1862         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1863                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
1864                         goto try_again;
1865                 }
1866         } else {
1867                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
1868                         goto try_again;
1869                 }
1870         }
1871
1872         /* finished distributing the public addresses, now just send the 
1873            info out to the nodes
1874         */
1875 finished:
1876
1877         /* at this point ->pnn is the node which will own each IP
1878            or -1 if there is no node that can cover this ip
1879         */
1880
1881         return;
1882 }
1883
1884 /*
1885   make any IP alias changes for public addresses that are necessary 
1886  */
1887 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1888 {
1889         int i;
1890         struct ctdb_public_ip ip;
1891         struct ctdb_public_ipv4 ipv4;
1892         uint32_t *nodes;
1893         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1894         TDB_DATA data;
1895         struct timeval timeout;
1896         struct client_async_data *async_data;
1897         struct ctdb_client_control_state *state;
1898         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1899
1900         /*
1901          * ip failover is completely disabled, just send out the 
1902          * ipreallocated event.
1903          */
1904         if (ctdb->tunable.disable_ip_failover != 0) {
1905                 goto ipreallocated;
1906         }
1907
1908         ZERO_STRUCT(ip);
1909
1910         /* Do the IP reassignment calculations */
1911         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
1912
1913         /* now tell all nodes to delete any alias that they should not
1914            have.  This will be a NOOP on nodes that don't currently
1915            hold the given alias */
1916         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1917         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1918
1919         for (i=0;i<nodemap->num;i++) {
1920                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1921                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1922                         continue;
1923                 }
1924
1925                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1926                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1927                                 /* This node should be serving this
1928                                    vnn so dont tell it to release the ip
1929                                 */
1930                                 continue;
1931                         }
1932                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1933                                 ipv4.pnn = tmp_ip->pnn;
1934                                 ipv4.sin = tmp_ip->addr.ip;
1935
1936                                 timeout = TAKEOVER_TIMEOUT();
1937                                 data.dsize = sizeof(ipv4);
1938                                 data.dptr  = (uint8_t *)&ipv4;
1939                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1940                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1941                                                 data, async_data,
1942                                                 &timeout, NULL);
1943                         } else {
1944                                 ip.pnn  = tmp_ip->pnn;
1945                                 ip.addr = tmp_ip->addr;
1946
1947                                 timeout = TAKEOVER_TIMEOUT();
1948                                 data.dsize = sizeof(ip);
1949                                 data.dptr  = (uint8_t *)&ip;
1950                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1951                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
1952                                                 data, async_data,
1953                                                 &timeout, NULL);
1954                         }
1955
1956                         if (state == NULL) {
1957                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1958                                 talloc_free(tmp_ctx);
1959                                 return -1;
1960                         }
1961                 
1962                         ctdb_client_async_add(async_data, state);
1963                 }
1964         }
1965         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1966                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1967                 talloc_free(tmp_ctx);
1968                 return -1;
1969         }
1970         talloc_free(async_data);
1971
1972
1973         /* tell all nodes to get their own IPs */
1974         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1975         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1976         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1977                 if (tmp_ip->pnn == -1) {
1978                         /* this IP won't be taken over */
1979                         continue;
1980                 }
1981
1982                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1983                         ipv4.pnn = tmp_ip->pnn;
1984                         ipv4.sin = tmp_ip->addr.ip;
1985
1986                         timeout = TAKEOVER_TIMEOUT();
1987                         data.dsize = sizeof(ipv4);
1988                         data.dptr  = (uint8_t *)&ipv4;
1989                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1990                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1991                                         data, async_data,
1992                                         &timeout, NULL);
1993                 } else {
1994                         ip.pnn  = tmp_ip->pnn;
1995                         ip.addr = tmp_ip->addr;
1996
1997                         timeout = TAKEOVER_TIMEOUT();
1998                         data.dsize = sizeof(ip);
1999                         data.dptr  = (uint8_t *)&ip;
2000                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2001                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2002                                         data, async_data,
2003                                         &timeout, NULL);
2004                 }
2005                 if (state == NULL) {
2006                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2007                         talloc_free(tmp_ctx);
2008                         return -1;
2009                 }
2010                 
2011                 ctdb_client_async_add(async_data, state);
2012         }
2013         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2014                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2015                 talloc_free(tmp_ctx);
2016                 return -1;
2017         }
2018
2019 ipreallocated:
2020         /* tell all nodes to update natwg */
2021         /* send the flags update natgw on all connected nodes */
2022         data.dptr  = discard_const("ipreallocated");
2023         data.dsize = strlen((char *)data.dptr) + 1; 
2024         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2025         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2026                                       nodes, 0, TAKEOVER_TIMEOUT(),
2027                                       false, data,
2028                                       NULL, NULL,
2029                                       NULL) != 0) {
2030                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
2031         }
2032
2033         talloc_free(tmp_ctx);
2034         return 0;
2035 }
2036
2037
2038 /*
2039   destroy a ctdb_client_ip structure
2040  */
2041 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2042 {
2043         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2044                 ctdb_addr_to_str(&ip->addr),
2045                 ntohs(ip->addr.ip.sin_port),
2046                 ip->client_id));
2047
2048         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2049         return 0;
2050 }
2051
2052 /*
2053   called by a client to inform us of a TCP connection that it is managing
2054   that should tickled with an ACK when IP takeover is done
2055   we handle both the old ipv4 style of packets as well as the new ipv4/6
2056   pdus.
2057  */
2058 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2059                                 TDB_DATA indata)
2060 {
2061         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2062         struct ctdb_control_tcp *old_addr = NULL;
2063         struct ctdb_control_tcp_addr new_addr;
2064         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2065         struct ctdb_tcp_list *tcp;
2066         struct ctdb_tcp_connection t;
2067         int ret;
2068         TDB_DATA data;
2069         struct ctdb_client_ip *ip;
2070         struct ctdb_vnn *vnn;
2071         ctdb_sock_addr addr;
2072
2073         switch (indata.dsize) {
2074         case sizeof(struct ctdb_control_tcp):
2075                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2076                 ZERO_STRUCT(new_addr);
2077                 tcp_sock = &new_addr;
2078                 tcp_sock->src.ip  = old_addr->src;
2079                 tcp_sock->dest.ip = old_addr->dest;
2080                 break;
2081         case sizeof(struct ctdb_control_tcp_addr):
2082                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2083                 break;
2084         default:
2085                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2086                                  "to ctdb_control_tcp_client. size was %d but "
2087                                  "only allowed sizes are %lu and %lu\n",
2088                                  (int)indata.dsize,
2089                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2090                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2091                 return -1;
2092         }
2093
2094         addr = tcp_sock->src;
2095         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2096         addr = tcp_sock->dest;
2097         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2098
2099         ZERO_STRUCT(addr);
2100         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2101         vnn = find_public_ip_vnn(ctdb, &addr);
2102         if (vnn == NULL) {
2103                 switch (addr.sa.sa_family) {
2104                 case AF_INET:
2105                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2106                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2107                                         ctdb_addr_to_str(&addr)));
2108                         }
2109                         break;
2110                 case AF_INET6:
2111                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2112                                 ctdb_addr_to_str(&addr)));
2113                         break;
2114                 default:
2115                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2116                 }
2117
2118                 return 0;
2119         }
2120
2121         if (vnn->pnn != ctdb->pnn) {
2122                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2123                         ctdb_addr_to_str(&addr),
2124                         client_id, client->pid));
2125                 /* failing this call will tell smbd to die */
2126                 return -1;
2127         }
2128
2129         ip = talloc(client, struct ctdb_client_ip);
2130         CTDB_NO_MEMORY(ctdb, ip);
2131
2132         ip->ctdb      = ctdb;
2133         ip->addr      = addr;
2134         ip->client_id = client_id;
2135         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2136         DLIST_ADD(ctdb->client_ip_list, ip);
2137
2138         tcp = talloc(client, struct ctdb_tcp_list);
2139         CTDB_NO_MEMORY(ctdb, tcp);
2140
2141         tcp->connection.src_addr = tcp_sock->src;
2142         tcp->connection.dst_addr = tcp_sock->dest;
2143
2144         DLIST_ADD(client->tcp_list, tcp);
2145
2146         t.src_addr = tcp_sock->src;
2147         t.dst_addr = tcp_sock->dest;
2148
2149         data.dptr = (uint8_t *)&t;
2150         data.dsize = sizeof(t);
2151
2152         switch (addr.sa.sa_family) {
2153         case AF_INET:
2154                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2155                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2156                         ctdb_addr_to_str(&tcp_sock->src),
2157                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2158                 break;
2159         case AF_INET6:
2160                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2161                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2162                         ctdb_addr_to_str(&tcp_sock->src),
2163                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2164                 break;
2165         default:
2166                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2167         }
2168
2169
2170         /* tell all nodes about this tcp connection */
2171         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2172                                        CTDB_CONTROL_TCP_ADD,
2173                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2174         if (ret != 0) {
2175                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2176                 return -1;
2177         }
2178
2179         return 0;
2180 }
2181
2182 /*
2183   find a tcp address on a list
2184  */
2185 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2186                                            struct ctdb_tcp_connection *tcp)
2187 {
2188         int i;
2189
2190         if (array == NULL) {
2191                 return NULL;
2192         }
2193
2194         for (i=0;i<array->num;i++) {
2195                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2196                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2197                         return &array->connections[i];
2198                 }
2199         }
2200         return NULL;
2201 }
2202
2203
2204
2205 /*
2206   called by a daemon to inform us of a TCP connection that one of its
2207   clients managing that should tickled with an ACK when IP takeover is
2208   done
2209  */
2210 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2211 {
2212         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2213         struct ctdb_tcp_array *tcparray;
2214         struct ctdb_tcp_connection tcp;
2215         struct ctdb_vnn *vnn;
2216
2217         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2218         if (vnn == NULL) {
2219                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2220                         ctdb_addr_to_str(&p->dst_addr)));
2221
2222                 return -1;
2223         }
2224
2225
2226         tcparray = vnn->tcp_array;
2227
2228         /* If this is the first tickle */
2229         if (tcparray == NULL) {
2230                 tcparray = talloc_size(ctdb->nodes, 
2231                         offsetof(struct ctdb_tcp_array, connections) +
2232                         sizeof(struct ctdb_tcp_connection) * 1);
2233                 CTDB_NO_MEMORY(ctdb, tcparray);
2234                 vnn->tcp_array = tcparray;
2235
2236                 tcparray->num = 0;
2237                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2238                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2239
2240                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2241                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2242                 tcparray->num++;
2243
2244                 if (tcp_update_needed) {
2245                         vnn->tcp_update_needed = true;
2246                 }
2247                 return 0;
2248         }
2249
2250
2251         /* Do we already have this tickle ?*/
2252         tcp.src_addr = p->src_addr;
2253         tcp.dst_addr = p->dst_addr;
2254         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2255                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2256                         ctdb_addr_to_str(&tcp.dst_addr),
2257                         ntohs(tcp.dst_addr.ip.sin_port),
2258                         vnn->pnn));
2259                 return 0;
2260         }
2261
2262         /* A new tickle, we must add it to the array */
2263         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2264                                         struct ctdb_tcp_connection,
2265                                         tcparray->num+1);
2266         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2267
2268         vnn->tcp_array = tcparray;
2269         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2270         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2271         tcparray->num++;
2272                                 
2273         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2274                 ctdb_addr_to_str(&tcp.dst_addr),
2275                 ntohs(tcp.dst_addr.ip.sin_port),
2276                 vnn->pnn));
2277
2278         if (tcp_update_needed) {
2279                 vnn->tcp_update_needed = true;
2280         }
2281
2282         return 0;
2283 }
2284
2285
2286 /*
2287   called by a daemon to inform us of a TCP connection that one of its
2288   clients managing that should tickled with an ACK when IP takeover is
2289   done
2290  */
2291 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2292 {
2293         struct ctdb_tcp_connection *tcpp;
2294         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2295
2296         if (vnn == NULL) {
2297                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2298                         ctdb_addr_to_str(&conn->dst_addr)));
2299                 return;
2300         }
2301
2302         /* if the array is empty we cant remove it
2303            and we dont need to do anything
2304          */
2305         if (vnn->tcp_array == NULL) {
2306                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2307                         ctdb_addr_to_str(&conn->dst_addr),
2308                         ntohs(conn->dst_addr.ip.sin_port)));
2309                 return;
2310         }
2311
2312
2313         /* See if we know this connection
2314            if we dont know this connection  then we dont need to do anything
2315          */
2316         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2317         if (tcpp == NULL) {
2318                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2319                         ctdb_addr_to_str(&conn->dst_addr),
2320                         ntohs(conn->dst_addr.ip.sin_port)));
2321                 return;
2322         }
2323
2324
2325         /* We need to remove this entry from the array.
2326            Instead of allocating a new array and copying data to it
2327            we cheat and just copy the last entry in the existing array
2328            to the entry that is to be removed and just shring the 
2329            ->num field
2330          */
2331         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2332         vnn->tcp_array->num--;
2333
2334         /* If we deleted the last entry we also need to remove the entire array
2335          */
2336         if (vnn->tcp_array->num == 0) {
2337                 talloc_free(vnn->tcp_array);
2338                 vnn->tcp_array = NULL;
2339         }               
2340
2341         vnn->tcp_update_needed = true;
2342
2343         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2344                 ctdb_addr_to_str(&conn->src_addr),
2345                 ntohs(conn->src_addr.ip.sin_port)));
2346 }
2347
2348
2349 /*
2350   called by a daemon to inform us of a TCP connection that one of its
2351   clients used are no longer needed in the tickle database
2352  */
2353 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2354 {
2355         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2356
2357         ctdb_remove_tcp_connection(ctdb, conn);
2358
2359         return 0;
2360 }
2361
2362
/*
  called when a daemon restarts.  The intention is to send all tickles
  for all public addresses we are serving to the new node immediately,
  but that is not implemented: both arguments are ignored and the
  control simply reports success.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        const int32_t status = 0;

        return status;
}
2372
2373
2374 /*
2375   called when a client structure goes away - hook to remove
2376   elements from the tcp_list in all daemons
2377  */
2378 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2379 {
2380         while (client->tcp_list) {
2381                 struct ctdb_tcp_list *tcp = client->tcp_list;
2382                 DLIST_REMOVE(client->tcp_list, tcp);
2383                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2384         }
2385 }
2386
2387
2388 /*
2389   release all IPs on shutdown
2390  */
2391 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2392 {
2393         struct ctdb_vnn *vnn;
2394
2395         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2396                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2397                         ctdb_vnn_unassign_iface(ctdb, vnn);
2398                         continue;
2399                 }
2400                 if (!vnn->iface) {
2401                         continue;
2402                 }
2403                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2404                                   ctdb_vnn_iface_string(vnn),
2405                                   ctdb_addr_to_str(&vnn->public_address),
2406                                   vnn->public_netmask_bits);
2407                 release_kill_clients(ctdb, &vnn->public_address);
2408                 ctdb_vnn_unassign_iface(ctdb, vnn);
2409         }
2410 }
2411
2412
2413 /*
2414   get list of public IPs
2415  */
2416 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2417                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2418 {
2419         int i, num, len;
2420         struct ctdb_all_public_ips *ips;
2421         struct ctdb_vnn *vnn;
2422         bool only_available = false;
2423
2424         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2425                 only_available = true;
2426         }
2427
2428         /* count how many public ip structures we have */
2429         num = 0;
2430         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2431                 num++;
2432         }
2433
2434         len = offsetof(struct ctdb_all_public_ips, ips) + 
2435                 num*sizeof(struct ctdb_public_ip);
2436         ips = talloc_zero_size(outdata, len);
2437         CTDB_NO_MEMORY(ctdb, ips);
2438
2439         i = 0;
2440         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2441                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2442                         continue;
2443                 }
2444                 ips->ips[i].pnn  = vnn->pnn;
2445                 ips->ips[i].addr = vnn->public_address;
2446                 i++;
2447         }
2448         ips->num = i;
2449         len = offsetof(struct ctdb_all_public_ips, ips) +
2450                 i*sizeof(struct ctdb_public_ip);
2451
2452         outdata->dsize = len;
2453         outdata->dptr  = (uint8_t *)ips;
2454
2455         return 0;
2456 }
2457
2458
2459 /*
2460   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2461  */
2462 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2463                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2464 {
2465         int i, num, len;
2466         struct ctdb_all_public_ipsv4 *ips;
2467         struct ctdb_vnn *vnn;
2468
2469         /* count how many public ip structures we have */
2470         num = 0;
2471         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2472                 if (vnn->public_address.sa.sa_family != AF_INET) {
2473                         continue;
2474                 }
2475                 num++;
2476         }
2477
2478         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2479                 num*sizeof(struct ctdb_public_ipv4);
2480         ips = talloc_zero_size(outdata, len);
2481         CTDB_NO_MEMORY(ctdb, ips);
2482
2483         outdata->dsize = len;
2484         outdata->dptr  = (uint8_t *)ips;
2485
2486         ips->num = num;
2487         i = 0;
2488         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2489                 if (vnn->public_address.sa.sa_family != AF_INET) {
2490                         continue;
2491                 }
2492                 ips->ips[i].pnn = vnn->pnn;
2493                 ips->ips[i].sin = vnn->public_address.ip;
2494                 i++;
2495         }
2496
2497         return 0;
2498 }
2499
2500 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2501                                         struct ctdb_req_control *c,
2502                                         TDB_DATA indata,
2503                                         TDB_DATA *outdata)
2504 {
2505         int i, num, len;
2506         ctdb_sock_addr *addr;
2507         struct ctdb_control_public_ip_info *info;
2508         struct ctdb_vnn *vnn;
2509
2510         addr = (ctdb_sock_addr *)indata.dptr;
2511
2512         vnn = find_public_ip_vnn(ctdb, addr);
2513         if (vnn == NULL) {
2514                 /* if it is not a public ip   it could be our 'single ip' */
2515                 if (ctdb->single_ip_vnn) {
2516                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2517                                 vnn = ctdb->single_ip_vnn;
2518                         }
2519                 }
2520         }
2521         if (vnn == NULL) {
2522                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2523                                  "'%s'not a public address\n",
2524                                  ctdb_addr_to_str(addr)));
2525                 return -1;
2526         }
2527
2528         /* count how many public ip structures we have */
2529         num = 0;
2530         for (;vnn->ifaces[num];) {
2531                 num++;
2532         }
2533
2534         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2535                 num*sizeof(struct ctdb_control_iface_info);
2536         info = talloc_zero_size(outdata, len);
2537         CTDB_NO_MEMORY(ctdb, info);
2538
2539         info->ip.addr = vnn->public_address;
2540         info->ip.pnn = vnn->pnn;
2541         info->active_idx = 0xFFFFFFFF;
2542
2543         for (i=0; vnn->ifaces[i]; i++) {
2544                 struct ctdb_iface *cur;
2545
2546                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2547                 if (cur == NULL) {
2548                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2549                                            vnn->ifaces[i]));
2550                         return -1;
2551                 }
2552                 if (vnn->iface == cur) {
2553                         info->active_idx = i;
2554                 }
2555                 strcpy(info->ifaces[i].name, cur->name);
2556                 info->ifaces[i].link_state = cur->link_up;
2557                 info->ifaces[i].references = cur->references;
2558         }
2559         info->num = i;
2560         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2561                 i*sizeof(struct ctdb_control_iface_info);
2562
2563         outdata->dsize = len;
2564         outdata->dptr  = (uint8_t *)info;
2565
2566         return 0;
2567 }
2568
2569 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2570                                 struct ctdb_req_control *c,
2571                                 TDB_DATA *outdata)
2572 {
2573         int i, num, len;
2574         struct ctdb_control_get_ifaces *ifaces;
2575         struct ctdb_iface *cur;
2576
2577         /* count how many public ip structures we have */
2578         num = 0;
2579         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2580                 num++;
2581         }
2582
2583         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2584                 num*sizeof(struct ctdb_control_iface_info);
2585         ifaces = talloc_zero_size(outdata, len);
2586         CTDB_NO_MEMORY(ctdb, ifaces);
2587
2588         i = 0;
2589         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2590                 strcpy(ifaces->ifaces[i].name, cur->name);
2591                 ifaces->ifaces[i].link_state = cur->link_up;
2592                 ifaces->ifaces[i].references = cur->references;
2593                 i++;
2594         }
2595         ifaces->num = i;
2596         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2597                 i*sizeof(struct ctdb_control_iface_info);
2598
2599         outdata->dsize = len;
2600         outdata->dptr  = (uint8_t *)ifaces;
2601
2602         return 0;
2603 }
2604
2605 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2606                                     struct ctdb_req_control *c,
2607                                     TDB_DATA indata)
2608 {
2609         struct ctdb_control_iface_info *info;
2610         struct ctdb_iface *iface;
2611         bool link_up = false;
2612
2613         info = (struct ctdb_control_iface_info *)indata.dptr;
2614
2615         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2616                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2617                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2618                                   len, len, info->name));
2619                 return -1;
2620         }
2621
2622         switch (info->link_state) {
2623         case 0:
2624                 link_up = false;
2625                 break;
2626         case 1:
2627                 link_up = true;
2628                 break;
2629         default:
2630                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2631                                   (unsigned int)info->link_state));
2632                 return -1;
2633         }
2634
2635         if (info->references != 0) {
2636                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2637                                   (unsigned int)info->references));
2638                 return -1;
2639         }
2640
2641         iface = ctdb_find_iface(ctdb, info->name);
2642         if (iface == NULL) {
2643                 DEBUG(DEBUG_ERR, (__location__ "iface[%s] is unknown\n",
2644                                   info->name));
2645                 return -1;
2646         }
2647
2648         if (link_up == iface->link_up) {
2649                 return 0;
2650         }
2651
2652         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2653               ("iface[%s] has changed it's link status %s => %s\n",
2654                iface->name,
2655                iface->link_up?"up":"down",
2656                link_up?"up":"down"));
2657
2658         iface->link_up = link_up;
2659         return 0;
2660 }
2661
2662
2663 /* 
2664    structure containing the listening socket and the list of tcp connections
2665    that the ctdb daemon is to kill
2666 */
2667 struct ctdb_kill_tcp {
2668         struct ctdb_vnn *vnn;
2669         struct ctdb_context *ctdb;
2670         int capture_fd;
2671         struct fd_event *fde;
2672         trbt_tree_t *connections;
2673         void *private_data;
2674 };
2675
2676 /*
2677   a tcp connection that is to be killed
2678  */
2679 struct ctdb_killtcp_con {
2680         ctdb_sock_addr src_addr;
2681         ctdb_sock_addr dst_addr;
2682         int count;
2683         struct ctdb_kill_tcp *killtcp;
2684 };
2685
2686 /* this function is used to create a key to represent this socketpair
2687    in the killtcp tree.
2688    this key is used to insert and lookup matching socketpairs that are
2689    to be tickled and RST
2690 */
2691 #define KILLTCP_KEYLEN  10
2692 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2693 {
2694         static uint32_t key[KILLTCP_KEYLEN];
2695
2696         bzero(key, sizeof(key));
2697
2698         if (src->sa.sa_family != dst->sa.sa_family) {
2699                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2700                 return key;
2701         }
2702         
2703         switch (src->sa.sa_family) {
2704         case AF_INET:
2705                 key[0]  = dst->ip.sin_addr.s_addr;
2706                 key[1]  = src->ip.sin_addr.s_addr;
2707                 key[2]  = dst->ip.sin_port;
2708                 key[3]  = src->ip.sin_port;
2709                 break;
2710         case AF_INET6:
2711                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
2712                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
2713                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
2714                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
2715                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
2716                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
2717                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
2718                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
2719                 key[8]  = dst->ip6.sin6_port;
2720                 key[9]  = src->ip6.sin6_port;
2721                 break;
2722         default:
2723                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2724                 return key;
2725         }
2726
2727         return key;
2728 }
2729
2730 /*
2731   called when we get a read event on the raw socket
2732  */
2733 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2734                                 uint16_t flags, void *private_data)
2735 {
2736         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2737         struct ctdb_killtcp_con *con;
2738         ctdb_sock_addr src, dst;
2739         uint32_t ack_seq, seq;
2740
2741         if (!(flags & EVENT_FD_READ)) {
2742                 return;
2743         }
2744
2745         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2746                                 killtcp->private_data,
2747                                 &src, &dst,
2748                                 &ack_seq, &seq) != 0) {
2749                 /* probably a non-tcp ACK packet */
2750                 return;
2751         }
2752
2753         /* check if we have this guy in our list of connections
2754            to kill
2755         */
2756         con = trbt_lookuparray32(killtcp->connections, 
2757                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2758         if (con == NULL) {
2759                 /* no this was some other packet we can just ignore */
2760                 return;
2761         }
2762
2763         /* This one has been tickled !
2764            now reset him and remove him from the list.
2765          */
2766         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2767                 ntohs(con->dst_addr.ip.sin_port),
2768                 ctdb_addr_to_str(&con->src_addr),
2769                 ntohs(con->src_addr.ip.sin_port)));
2770
2771         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2772         talloc_free(con);
2773 }
2774
2775
2776 /* when traversing the list of all tcp connections to send tickle acks to
2777    (so that we can capture the ack coming back and kill the connection
2778     by a RST)
2779    this callback is called for each connection we are currently trying to kill
2780 */
2781 static void tickle_connection_traverse(void *param, void *data)
2782 {
2783         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2784
2785         /* have tried too many times, just give up */
2786         if (con->count >= 5) {
2787                 /* can't delete in traverse: reparent to delete_cons */
2788                 talloc_steal(param, con);
2789                 return;
2790         }
2791
2792         /* othervise, try tickling it again */
2793         con->count++;
2794         ctdb_sys_send_tcp(
2795                 (ctdb_sock_addr *)&con->dst_addr,
2796                 (ctdb_sock_addr *)&con->src_addr,
2797                 0, 0, 0);
2798 }
2799
2800
2801 /* 
2802    called every second until all sentenced connections have been reset
2803  */
2804 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2805                                               struct timeval t, void *private_data)
2806 {
2807         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2808         void *delete_cons = talloc_new(NULL);
2809
2810         /* loop over all connections sending tickle ACKs */
2811         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2812
2813         /* now we've finished traverse, it's safe to do deletion. */
2814         talloc_free(delete_cons);
2815
2816         /* If there are no more connections to kill we can remove the
2817            entire killtcp structure
2818          */
2819         if ( (killtcp->connections == NULL) || 
2820              (killtcp->connections->root == NULL) ) {
2821                 talloc_free(killtcp);
2822                 return;
2823         }
2824
2825         /* try tickling them again in a seconds time
2826          */
2827         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2828                         ctdb_tickle_sentenced_connections, killtcp);
2829 }
2830
2831 /*
2832   destroy the killtcp structure
2833  */
2834 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2835 {
2836         if (killtcp->vnn) {
2837                 killtcp->vnn->killtcp = NULL;
2838         }
2839         return 0;
2840 }
2841
2842
/* insert callback for the killtcp tree: whatever is stored under the
   key already (data) is ignored and the new connection structure
   (parm) is always the value returned.

   dont even free the old one if it did exist, that one is talloc_stolen
   by the same node in the tree anyway and will be deleted when the new
   data is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
2854
2855 /*
2856   add a tcp socket to the list of connections we want to RST
2857  */
2858 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2859                                        ctdb_sock_addr *s,
2860                                        ctdb_sock_addr *d)
2861 {
2862         ctdb_sock_addr src, dst;
2863         struct ctdb_kill_tcp *killtcp;
2864         struct ctdb_killtcp_con *con;
2865         struct ctdb_vnn *vnn;
2866
2867         ctdb_canonicalize_ip(s, &src);
2868         ctdb_canonicalize_ip(d, &dst);
2869
2870         vnn = find_public_ip_vnn(ctdb, &dst);
2871         if (vnn == NULL) {
2872                 vnn = find_public_ip_vnn(ctdb, &src);
2873         }
2874         if (vnn == NULL) {
2875                 /* if it is not a public ip   it could be our 'single ip' */
2876                 if (ctdb->single_ip_vnn) {
2877                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2878                                 vnn = ctdb->single_ip_vnn;
2879                         }
2880                 }
2881         }
2882         if (vnn == NULL) {
2883                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2884                 return -1;
2885         }
2886
2887         killtcp = vnn->killtcp;
2888         
2889         /* If this is the first connection to kill we must allocate
2890            a new structure
2891          */
2892         if (killtcp == NULL) {
2893                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2894                 CTDB_NO_MEMORY(ctdb, killtcp);
2895
2896                 killtcp->vnn         = vnn;
2897                 killtcp->ctdb        = ctdb;
2898                 killtcp->capture_fd  = -1;
2899                 killtcp->connections = trbt_create(killtcp, 0);
2900
2901                 vnn->killtcp         = killtcp;
2902                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2903         }
2904
2905
2906
2907         /* create a structure that describes this connection we want to
2908            RST and store it in killtcp->connections
2909         */
2910         con = talloc(killtcp, struct ctdb_killtcp_con);
2911         CTDB_NO_MEMORY(ctdb, con);
2912         con->src_addr = src;
2913         con->dst_addr = dst;
2914         con->count    = 0;
2915         con->killtcp  = killtcp;
2916
2917
2918         trbt_insertarray32_callback(killtcp->connections,
2919                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2920                         add_killtcp_callback, con);
2921
2922         /* 
2923            If we dont have a socket to listen on yet we must create it
2924          */
2925         if (killtcp->capture_fd == -1) {
2926                 const char *iface = ctdb_vnn_iface_string(vnn);
2927                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2928                 if (killtcp->capture_fd == -1) {
2929                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2930                                           "socket on iface '%s' for killtcp (%s)\n",
2931                                           iface, strerror(errno)));
2932                         goto failed;
2933                 }
2934         }
2935
2936
2937         if (killtcp->fde == NULL) {
2938                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
2939                                             EVENT_FD_READ,
2940                                             capture_tcp_handler, killtcp);
2941                 tevent_fd_set_auto_close(killtcp->fde);
2942
2943                 /* We also need to set up some events to tickle all these connections
2944                    until they are all reset
2945                 */
2946                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2947                                 ctdb_tickle_sentenced_connections, killtcp);
2948         }
2949
2950         /* tickle him once now */
2951         ctdb_sys_send_tcp(
2952                 &con->dst_addr,
2953                 &con->src_addr,
2954                 0, 0, 0);
2955
2956         return 0;
2957
2958 failed:
2959         talloc_free(vnn->killtcp);
2960         vnn->killtcp = NULL;
2961         return -1;
2962 }
2963
2964 /*
2965   kill a TCP connection.
2966  */
2967 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2968 {
2969         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2970
2971         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2972 }
2973
2974 /*
2975   called by a daemon to inform us of the entire list of TCP tickles for
2976   a particular public address.
2977   this control should only be sent by the node that is currently serving
2978   that public address.
2979  */
2980 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2981 {
2982         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2983         struct ctdb_tcp_array *tcparray;
2984         struct ctdb_vnn *vnn;
2985
2986         /* We must at least have tickles.num or else we cant verify the size
2987            of the received data blob
2988          */
2989         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2990                                         tickles.connections)) {
2991                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2992                 return -1;
2993         }
2994
2995         /* verify that the size of data matches what we expect */
2996         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2997                                 tickles.connections)
2998                          + sizeof(struct ctdb_tcp_connection)
2999                                  * list->tickles.num) {
3000                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3001                 return -1;
3002         }       
3003
3004         vnn = find_public_ip_vnn(ctdb, &list->addr);
3005         if (vnn == NULL) {
3006                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3007                         ctdb_addr_to_str(&list->addr)));
3008
3009                 return 1;
3010         }
3011
3012         /* remove any old ticklelist we might have */
3013         talloc_free(vnn->tcp_array);
3014         vnn->tcp_array = NULL;
3015
3016         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3017         CTDB_NO_MEMORY(ctdb, tcparray);
3018
3019         tcparray->num = list->tickles.num;
3020
3021         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3022         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3023
3024         memcpy(tcparray->connections, &list->tickles.connections[0], 
3025                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3026
3027         /* We now have a new fresh tickle list array for this vnn */
3028         vnn->tcp_array = talloc_steal(vnn, tcparray);
3029         
3030         return 0;
3031 }
3032
3033 /*
3034   called to return the full list of tickles for the puclic address associated 
3035   with the provided vnn
3036  */
3037 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3038 {
3039         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3040         struct ctdb_control_tcp_tickle_list *list;
3041         struct ctdb_tcp_array *tcparray;
3042         int num;
3043         struct ctdb_vnn *vnn;
3044
3045         vnn = find_public_ip_vnn(ctdb, addr);
3046         if (vnn == NULL) {
3047                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3048                         ctdb_addr_to_str(addr)));
3049
3050                 return 1;
3051         }
3052
3053         tcparray = vnn->tcp_array;
3054         if (tcparray) {
3055                 num = tcparray->num;
3056         } else {
3057                 num = 0;
3058         }
3059
3060         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3061                                 tickles.connections)
3062                         + sizeof(struct ctdb_tcp_connection) * num;
3063
3064         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3065         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3066         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3067
3068         list->addr = *addr;
3069         list->tickles.num = num;
3070         if (num) {
3071                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3072                         sizeof(struct ctdb_tcp_connection) * num);
3073         }
3074
3075         return 0;
3076 }
3077
3078
3079 /*
3080   set the list of all tcp tickles for a public address
3081  */
3082 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3083                               struct timeval timeout, uint32_t destnode, 
3084                               ctdb_sock_addr *addr,
3085                               struct ctdb_tcp_array *tcparray)
3086 {
3087         int ret, num;
3088         TDB_DATA data;
3089         struct ctdb_control_tcp_tickle_list *list;
3090
3091         if (tcparray) {
3092                 num = tcparray->num;
3093         } else {
3094                 num = 0;
3095         }
3096
3097         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3098                                 tickles.connections) +
3099                         sizeof(struct ctdb_tcp_connection) * num;
3100         data.dptr = talloc_size(ctdb, data.dsize);
3101         CTDB_NO_MEMORY(ctdb, data.dptr);
3102
3103         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3104         list->addr = *addr;
3105         list->tickles.num = num;
3106         if (tcparray) {
3107                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3108         }
3109
3110         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3111                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3112                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3113         if (ret != 0) {
3114                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3115                 return -1;
3116         }
3117
3118         talloc_free(data.dptr);
3119
3120         return ret;
3121 }
3122
3123
3124 /*
3125   perform tickle updates if required
3126  */
3127 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3128                                 struct timed_event *te, 
3129                                 struct timeval t, void *private_data)
3130 {
3131         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3132         int ret;
3133         struct ctdb_vnn *vnn;
3134
3135         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3136                 /* we only send out updates for public addresses that 
3137                    we have taken over
3138                  */
3139                 if (ctdb->pnn != vnn->pnn) {
3140                         continue;
3141                 }
3142                 /* We only send out the updates if we need to */
3143                 if (!vnn->tcp_update_needed) {
3144                         continue;
3145                 }
3146                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3147                                 TAKEOVER_TIMEOUT(),
3148                                 CTDB_BROADCAST_CONNECTED,
3149                                 &vnn->public_address,
3150                                 vnn->tcp_array);
3151                 if (ret != 0) {
3152                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3153                                 ctdb_addr_to_str(&vnn->public_address)));
3154                 }
3155         }
3156
3157         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3158                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3159                              ctdb_update_tcp_tickles, ctdb);
3160 }               
3161         
3162
3163 /*
3164   start periodic update of tcp tickles
3165  */
3166 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3167 {
3168         ctdb->tickle_update_context = talloc_new(ctdb);
3169
3170         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3171                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3172                              ctdb_update_tcp_tickles, ctdb);
3173 }
3174
3175
3176
3177
3178 struct control_gratious_arp {
3179         struct ctdb_context *ctdb;
3180         ctdb_sock_addr addr;
3181         const char *iface;
3182         int count;
3183 };
3184
3185 /*
3186   send a control_gratuitous arp
3187  */
3188 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3189                                   struct timeval t, void *private_data)
3190 {
3191         int ret;
3192         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3193                                                         struct control_gratious_arp);
3194
3195         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3196         if (ret != 0) {
3197                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3198                                  arp->iface, strerror(errno)));
3199         }
3200
3201
3202         arp->count++;
3203         if (arp->count == CTDB_ARP_REPEAT) {
3204                 talloc_free(arp);
3205                 return;
3206         }
3207
3208         event_add_timed(arp->ctdb->ev, arp, 
3209                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3210                         send_gratious_arp, arp);
3211 }
3212
3213
3214 /*
3215   send a gratious arp 
3216  */
3217 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3218 {
3219         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3220         struct control_gratious_arp *arp;
3221
3222         /* verify the size of indata */
3223         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3224                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3225                                  (unsigned)indata.dsize, 
3226                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3227                 return -1;
3228         }
3229         if (indata.dsize != 
3230                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3231                 + gratious_arp->len ) ){
3232
3233                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3234                         "but should be %u bytes\n", 
3235                          (unsigned)indata.dsize, 
3236                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3237                 return -1;
3238         }
3239
3240
3241         arp = talloc(ctdb, struct control_gratious_arp);
3242         CTDB_NO_MEMORY(ctdb, arp);
3243
3244         arp->ctdb  = ctdb;
3245         arp->addr   = gratious_arp->addr;
3246         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3247         CTDB_NO_MEMORY(ctdb, arp->iface);
3248         arp->count = 0;
3249         
3250         event_add_timed(arp->ctdb->ev, arp, 
3251                         timeval_zero(), send_gratious_arp, arp);
3252
3253         return 0;
3254 }
3255
3256 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3257 {
3258         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3259         int ret;
3260
3261         /* verify the size of indata */
3262         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3263                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3264                 return -1;
3265         }
3266         if (indata.dsize != 
3267                 ( offsetof(struct ctdb_control_ip_iface, iface)
3268                 + pub->len ) ){
3269
3270                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3271                         "but should be %u bytes\n", 
3272                          (unsigned)indata.dsize, 
3273                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3274                 return -1;
3275         }
3276
3277         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
3278
3279         if (ret != 0) {
3280                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3281                 return -1;
3282         }
3283
3284         return 0;
3285 }
3286
/*
  Completion callback for the releaseip event started by
  ctdb_control_del_public_address().  private_data is the talloc
  context handed over when the event was started; freeing it is all
  that is left to do.  The event status is not looked at.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
				void *private_data)
{
	void *release_ctx = private_data;

	talloc_free(release_ctx);
}
3295
3296 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3297 {
3298         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3299         struct ctdb_vnn *vnn;
3300         int ret;
3301
3302         /* verify the size of indata */
3303         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3304                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3305                 return -1;
3306         }
3307         if (indata.dsize != 
3308                 ( offsetof(struct ctdb_control_ip_iface, iface)
3309                 + pub->len ) ){
3310
3311                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3312                         "but should be %u bytes\n", 
3313                          (unsigned)indata.dsize, 
3314                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3315                 return -1;
3316         }
3317
3318         /* walk over all public addresses until we find a match */
3319         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3320                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3321                         TALLOC_CTX *mem_ctx;
3322
3323                         DLIST_REMOVE(ctdb->vnn, vnn);
3324                         if (vnn->iface != NULL) {
3325                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3326                         }
3327                         if (vnn->pnn != ctdb->pnn) {
3328                                 talloc_free(vnn);
3329                                 return 0;
3330                         }
3331
3332                         mem_ctx = talloc_new(ctdb);
3333                         talloc_steal(mem_ctx, vnn);
3334                         ret = ctdb_event_script_callback(ctdb, 
3335                                          mem_ctx, delete_ip_callback, mem_ctx,
3336                                          false,
3337                                          CTDB_EVENT_RELEASE_IP,
3338                                          "%s %s %u",
3339                                          ctdb_vnn_iface_string(vnn),
3340                                          ctdb_addr_to_str(&vnn->public_address),
3341                                          vnn->public_netmask_bits);
3342                         if (ret != 0) {
3343                                 return -1;
3344                         }
3345                         return 0;
3346                 }
3347         }
3348
3349         return -1;
3350 }
3351
3352 /* This function is called from the recovery daemon to verify that a remote
3353    node has the expected ip allocation.
3354    This is verified against ctdb->ip_tree
3355 */
3356 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3357 {
3358         struct ctdb_public_ip_list *tmp_ip; 
3359         int i;
3360
3361         if (ctdb->ip_tree == NULL) {
3362                 /* dont know the expected allocation yet, assume remote node
3363                    is correct. */
3364                 return 0;
3365         }
3366
3367         if (ips == NULL) {
3368                 return 0;
3369         }
3370
3371         for (i=0; i<ips->num; i++) {
3372                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3373                 if (tmp_ip == NULL) {
3374                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3375                         return -1;
3376                 }
3377
3378                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3379                         continue;
3380                 }
3381
3382                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3383                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3384                         return -1;
3385                 }
3386         }
3387
3388         return 0;
3389 }
3390
3391 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3392 {
3393         struct ctdb_public_ip_list *tmp_ip; 
3394
3395         if (ctdb->ip_tree == NULL) {
3396                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3397                 return -1;
3398         }
3399
3400         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3401         if (tmp_ip == NULL) {
3402                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3403                 return -1;
3404         }
3405
3406         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3407         tmp_ip->pnn = ip->pnn;
3408
3409         return 0;
3410 }