recoverd: When calculating rebalance candidates don't consider flags
[obnox/ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
29
30
/*
 * Timeout value for takeover operations, built from the
 * takeover_timeout tunable.  The expansion reads a variable named
 * 'ctdb' from whatever scope the macro is used in.
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* First argument of the timeval offset used between repeated ARPs. */
#define CTDB_ARP_INTERVAL 1
/* Number of announcement rounds sent before the ARP state is freed. */
#define CTDB_ARP_REPEAT   3
35
/*
 * A network interface known to this daemon.  Entries are kept on the
 * doubly-linked list headed by ctdb->ifaces.
 */
struct ctdb_iface {
	struct ctdb_iface *prev;	/* list linkage */
	struct ctdb_iface *next;	/* list linkage */
	const char *name;		/* interface name */
	bool link_up;			/* interfaces with link down are not chosen */
	uint32_t references;		/* count of vnns assigned to this interface */
};
42
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
44 {
45         if (vnn->iface) {
46                 return vnn->iface->name;
47         }
48
49         return "__none__";
50 }
51
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
53 {
54         struct ctdb_iface *i;
55
56         /* Verify that we dont have an entry for this ip yet */
57         for (i=ctdb->ifaces;i;i=i->next) {
58                 if (strcmp(i->name, iface) == 0) {
59                         return 0;
60                 }
61         }
62
63         /* create a new structure for this interface */
64         i = talloc_zero(ctdb, struct ctdb_iface);
65         CTDB_NO_MEMORY_FATAL(ctdb, i);
66         i->name = talloc_strdup(i, iface);
67         CTDB_NO_MEMORY(ctdb, i->name);
68         /*
69          * If link_up defaults to true then IPs can be allocated to a
70          * node during the first recovery.  However, then an interface
71          * could have its link marked down during the startup event,
72          * causing the IP to move almost immediately.  If link_up
73          * defaults to false then, during normal operation, IPs added
74          * to a new interface can't be assigned until a monitor cycle
75          * has occurred and marked the new interfaces up.  This makes
76          * IP allocation unpredictable.  The following is a neat
77          * compromise: early in startup link_up defaults to false, so
78          * IPs can't be assigned, and after startup IPs can be
79          * assigned immediately.
80          */
81         i->link_up = ctdb->done_startup;
82
83         DLIST_ADD(ctdb->ifaces, i);
84
85         return 0;
86 }
87
88 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
89                                         const char *name)
90 {
91         int n;
92
93         for (n = 0; vnn->ifaces[n] != NULL; n++) {
94                 if (strcmp(name, vnn->ifaces[n]) == 0) {
95                         return true;
96                 }
97         }
98
99         return false;
100 }
101
102 /* If any interfaces now have no possible IPs then delete them.  This
103  * implementation is naive (i.e. simple) rather than clever
104  * (i.e. complex).  Given that this is run on delip and that operation
105  * is rare, this doesn't need to be efficient - it needs to be
106  * foolproof.  One alternative is reference counting, where the logic
107  * is distributed and can, therefore, be broken in multiple places.
108  * Another alternative is to build a red-black tree of interfaces that
109  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
110  * once) and then walking ctdb->ifaces once and deleting those not in
111  * the tree.  Let's go to one of those if the naive implementation
112  * causes problems...  :-)
113  */
114 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
115                                         struct ctdb_vnn *vnn,
116                                         TALLOC_CTX *mem_ctx)
117 {
118         struct ctdb_iface *i;
119
120         /* For each interface, check if there's an IP using it. */
121         for(i=ctdb->ifaces; i; i=i->next) {
122                 struct ctdb_vnn *tv;
123                 bool found;
124
125                 /* Only consider interfaces named in the given VNN. */
126                 if (!vnn_has_interface_with_name(vnn, i->name)) {
127                         continue;
128                 }
129
130                 /* Is the "single IP" on this interface? */
131                 if ((ctdb->single_ip_vnn != NULL) &&
132                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
133                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
134                         /* Found, next interface please... */
135                         continue;
136                 }
137                 /* Search for a vnn with this interface. */
138                 found = false;
139                 for (tv=ctdb->vnn; tv; tv=tv->next) {
140                         if (vnn_has_interface_with_name(tv, i->name)) {
141                                 found = true;
142                                 break;
143                         }
144                 }
145
146                 if (!found) {
147                         /* None of the VNNs are using this interface. */
148                         DLIST_REMOVE(ctdb->ifaces, i);
149                         /* Caller will free mem_ctx when convenient. */
150                         talloc_steal(mem_ctx, i);
151                 }
152         }
153 }
154
155
156 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
157                                           const char *iface)
158 {
159         struct ctdb_iface *i;
160
161         /* Verify that we dont have an entry for this ip yet */
162         for (i=ctdb->ifaces;i;i=i->next) {
163                 if (strcmp(i->name, iface) == 0) {
164                         return i;
165                 }
166         }
167
168         return NULL;
169 }
170
171 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
172                                               struct ctdb_vnn *vnn)
173 {
174         int i;
175         struct ctdb_iface *cur = NULL;
176         struct ctdb_iface *best = NULL;
177
178         for (i=0; vnn->ifaces[i]; i++) {
179
180                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
181                 if (cur == NULL) {
182                         continue;
183                 }
184
185                 if (!cur->link_up) {
186                         continue;
187                 }
188
189                 if (best == NULL) {
190                         best = cur;
191                         continue;
192                 }
193
194                 if (cur->references < best->references) {
195                         best = cur;
196                         continue;
197                 }
198         }
199
200         return best;
201 }
202
203 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
204                                      struct ctdb_vnn *vnn)
205 {
206         struct ctdb_iface *best = NULL;
207
208         if (vnn->iface) {
209                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
210                                    "still assigned to iface '%s'\n",
211                                    ctdb_addr_to_str(&vnn->public_address),
212                                    ctdb_vnn_iface_string(vnn)));
213                 return 0;
214         }
215
216         best = ctdb_vnn_best_iface(ctdb, vnn);
217         if (best == NULL) {
218                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
219                                   "cannot assign to iface any iface\n",
220                                   ctdb_addr_to_str(&vnn->public_address)));
221                 return -1;
222         }
223
224         vnn->iface = best;
225         best->references++;
226         vnn->pnn = ctdb->pnn;
227
228         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
229                            "now assigned to iface '%s' refs[%d]\n",
230                            ctdb_addr_to_str(&vnn->public_address),
231                            ctdb_vnn_iface_string(vnn),
232                            best->references));
233         return 0;
234 }
235
236 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
237                                     struct ctdb_vnn *vnn)
238 {
239         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
240                            "now unassigned (old iface '%s' refs[%d])\n",
241                            ctdb_addr_to_str(&vnn->public_address),
242                            ctdb_vnn_iface_string(vnn),
243                            vnn->iface?vnn->iface->references:0));
244         if (vnn->iface) {
245                 vnn->iface->references--;
246         }
247         vnn->iface = NULL;
248         if (vnn->pnn == ctdb->pnn) {
249                 vnn->pnn = -1;
250         }
251 }
252
253 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
254                                struct ctdb_vnn *vnn)
255 {
256         int i;
257
258         if (vnn->iface && vnn->iface->link_up) {
259                 return true;
260         }
261
262         for (i=0; vnn->ifaces[i]; i++) {
263                 struct ctdb_iface *cur;
264
265                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
266                 if (cur == NULL) {
267                         continue;
268                 }
269
270                 if (cur->link_up) {
271                         return true;
272                 }
273         }
274
275         return false;
276 }
277
278 struct ctdb_takeover_arp {
279         struct ctdb_context *ctdb;
280         uint32_t count;
281         ctdb_sock_addr addr;
282         struct ctdb_tcp_array *tcparray;
283         struct ctdb_vnn *vnn;
284 };
285
286
287 /*
288   lists of tcp endpoints
289  */
290 struct ctdb_tcp_list {
291         struct ctdb_tcp_list *prev, *next;
292         struct ctdb_tcp_connection connection;
293 };
294
295 /*
296   list of clients to kill on IP release
297  */
298 struct ctdb_client_ip {
299         struct ctdb_client_ip *prev, *next;
300         struct ctdb_context *ctdb;
301         ctdb_sock_addr addr;
302         uint32_t client_id;
303 };
304
305
306 /*
307   send a gratuitous arp
308  */
309 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
310                                   struct timeval t, void *private_data)
311 {
312         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
313                                                         struct ctdb_takeover_arp);
314         int i, ret;
315         struct ctdb_tcp_array *tcparray;
316         const char *iface = ctdb_vnn_iface_string(arp->vnn);
317
318         ret = ctdb_sys_send_arp(&arp->addr, iface);
319         if (ret != 0) {
320                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
321                                   iface, strerror(errno)));
322         }
323
324         tcparray = arp->tcparray;
325         if (tcparray) {
326                 for (i=0;i<tcparray->num;i++) {
327                         struct ctdb_tcp_connection *tcon;
328
329                         tcon = &tcparray->connections[i];
330                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
331                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
332                                 ctdb_addr_to_str(&tcon->src_addr),
333                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
334                         ret = ctdb_sys_send_tcp(
335                                 &tcon->src_addr, 
336                                 &tcon->dst_addr,
337                                 0, 0, 0);
338                         if (ret != 0) {
339                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
340                                         ctdb_addr_to_str(&tcon->src_addr)));
341                         }
342                 }
343         }
344
345         arp->count++;
346
347         if (arp->count == CTDB_ARP_REPEAT) {
348                 talloc_free(arp);
349                 return;
350         }
351
352         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
353                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
354                         ctdb_control_send_arp, arp);
355 }
356
357 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
358                                        struct ctdb_vnn *vnn)
359 {
360         struct ctdb_takeover_arp *arp;
361         struct ctdb_tcp_array *tcparray;
362
363         if (!vnn->takeover_ctx) {
364                 vnn->takeover_ctx = talloc_new(vnn);
365                 if (!vnn->takeover_ctx) {
366                         return -1;
367                 }
368         }
369
370         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
371         if (!arp) {
372                 return -1;
373         }
374
375         arp->ctdb = ctdb;
376         arp->addr = vnn->public_address;
377         arp->vnn  = vnn;
378
379         tcparray = vnn->tcp_array;
380         if (tcparray) {
381                 /* add all of the known tcp connections for this IP to the
382                    list of tcp connections to send tickle acks for */
383                 arp->tcparray = talloc_steal(arp, tcparray);
384
385                 vnn->tcp_array = NULL;
386                 vnn->tcp_update_needed = true;
387         }
388
389         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
390                         timeval_zero(), ctdb_control_send_arp, arp);
391
392         return 0;
393 }
394
395 struct takeover_callback_state {
396         struct ctdb_req_control *c;
397         ctdb_sock_addr *addr;
398         struct ctdb_vnn *vnn;
399 };
400
/*
 * Per-request state carried from ctdb_do_takeip() to its event
 * callback.
 */
struct ctdb_do_takeip_state {
	struct ctdb_req_control *c;	/* control request to reply to */
	struct ctdb_vnn *vnn;		/* vnn being taken over */
};
405
406 /*
407   called when takeip event finishes
408  */
409 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
410                                     void *private_data)
411 {
412         struct ctdb_do_takeip_state *state =
413                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
414         int32_t ret;
415         TDB_DATA data;
416
417         if (status != 0) {
418                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
419         
420                 if (status == -ETIME) {
421                         ctdb_ban_self(ctdb);
422                 }
423                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
424                                  ctdb_addr_to_str(&state->vnn->public_address),
425                                  ctdb_vnn_iface_string(state->vnn)));
426                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
427
428                 node->flags |= NODE_FLAGS_UNHEALTHY;
429                 talloc_free(state);
430                 return;
431         }
432
433         if (ctdb->do_checkpublicip) {
434
435         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
436         if (ret != 0) {
437                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
438                 talloc_free(state);
439                 return;
440         }
441
442         }
443
444         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
445         data.dsize = strlen((char *)data.dptr) + 1;
446         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
447
448         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
449
450
451         /* the control succeeded */
452         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
453         talloc_free(state);
454         return;
455 }
456
457 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
458 {
459         state->vnn->update_in_flight = false;
460         return 0;
461 }
462
463 /*
464   take over an ip address
465  */
466 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
467                               struct ctdb_req_control *c,
468                               struct ctdb_vnn *vnn)
469 {
470         int ret;
471         struct ctdb_do_takeip_state *state;
472
473         if (vnn->update_in_flight) {
474                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
475                                     "update for this IP already in flight\n",
476                                     ctdb_addr_to_str(&vnn->public_address),
477                                     vnn->public_netmask_bits));
478                 return -1;
479         }
480
481         ret = ctdb_vnn_assign_iface(ctdb, vnn);
482         if (ret != 0) {
483                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
484                                  "assign a usable interface\n",
485                                  ctdb_addr_to_str(&vnn->public_address),
486                                  vnn->public_netmask_bits));
487                 return -1;
488         }
489
490         state = talloc(vnn, struct ctdb_do_takeip_state);
491         CTDB_NO_MEMORY(ctdb, state);
492
493         state->c = talloc_steal(ctdb, c);
494         state->vnn   = vnn;
495
496         vnn->update_in_flight = true;
497         talloc_set_destructor(state, ctdb_takeip_destructor);
498
499         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
500                             ctdb_addr_to_str(&vnn->public_address),
501                             vnn->public_netmask_bits,
502                             ctdb_vnn_iface_string(vnn)));
503
504         ret = ctdb_event_script_callback(ctdb,
505                                          state,
506                                          ctdb_do_takeip_callback,
507                                          state,
508                                          false,
509                                          CTDB_EVENT_TAKE_IP,
510                                          "%s %s %u",
511                                          ctdb_vnn_iface_string(vnn),
512                                          ctdb_addr_to_str(&vnn->public_address),
513                                          vnn->public_netmask_bits);
514
515         if (ret != 0) {
516                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
517                         ctdb_addr_to_str(&vnn->public_address),
518                         ctdb_vnn_iface_string(vnn)));
519                 talloc_free(state);
520                 return -1;
521         }
522
523         return 0;
524 }
525
/*
 * Per-request state carried from ctdb_do_updateip() to its event
 * callback.
 */
struct ctdb_do_updateip_state {
	struct ctdb_req_control *c;	/* control request to reply to */
	struct ctdb_iface *old;		/* interface the IP is moving away from */
	struct ctdb_vnn *vnn;		/* vnn being moved */
};
531
532 /*
533   called when updateip event finishes
534  */
535 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
536                                       void *private_data)
537 {
538         struct ctdb_do_updateip_state *state =
539                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
540         int32_t ret;
541
542         if (status != 0) {
543                 if (status == -ETIME) {
544                         ctdb_ban_self(ctdb);
545                 }
546                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
547                         ctdb_addr_to_str(&state->vnn->public_address),
548                         state->old->name,
549                         ctdb_vnn_iface_string(state->vnn)));
550
551                 /*
552                  * All we can do is reset the old interface
553                  * and let the next run fix it
554                  */
555                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
556                 state->vnn->iface = state->old;
557                 state->vnn->iface->references++;
558
559                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
560                 talloc_free(state);
561                 return;
562         }
563
564         if (ctdb->do_checkpublicip) {
565
566         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
567         if (ret != 0) {
568                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
569                 talloc_free(state);
570                 return;
571         }
572
573         }
574
575         /* the control succeeded */
576         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
577         talloc_free(state);
578         return;
579 }
580
581 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
582 {
583         state->vnn->update_in_flight = false;
584         return 0;
585 }
586
587 /*
588   update (move) an ip address
589  */
590 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
591                                 struct ctdb_req_control *c,
592                                 struct ctdb_vnn *vnn)
593 {
594         int ret;
595         struct ctdb_do_updateip_state *state;
596         struct ctdb_iface *old = vnn->iface;
597         const char *new_name;
598
599         if (vnn->update_in_flight) {
600                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
601                                     "update for this IP already in flight\n",
602                                     ctdb_addr_to_str(&vnn->public_address),
603                                     vnn->public_netmask_bits));
604                 return -1;
605         }
606
607         ctdb_vnn_unassign_iface(ctdb, vnn);
608         ret = ctdb_vnn_assign_iface(ctdb, vnn);
609         if (ret != 0) {
610                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
611                                  "assin a usable interface (old iface '%s')\n",
612                                  ctdb_addr_to_str(&vnn->public_address),
613                                  vnn->public_netmask_bits,
614                                  old->name));
615                 return -1;
616         }
617
618         new_name = ctdb_vnn_iface_string(vnn);
619         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
620                 /* A benign update from one interface onto itself.
621                  * no need to run the eventscripts in this case, just return
622                  * success.
623                  */
624                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
625                 return 0;
626         }
627
628         state = talloc(vnn, struct ctdb_do_updateip_state);
629         CTDB_NO_MEMORY(ctdb, state);
630
631         state->c = talloc_steal(ctdb, c);
632         state->old = old;
633         state->vnn = vnn;
634
635         vnn->update_in_flight = true;
636         talloc_set_destructor(state, ctdb_updateip_destructor);
637
638         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
639                             "interface %s to %s\n",
640                             ctdb_addr_to_str(&vnn->public_address),
641                             vnn->public_netmask_bits,
642                             old->name,
643                             new_name));
644
645         ret = ctdb_event_script_callback(ctdb,
646                                          state,
647                                          ctdb_do_updateip_callback,
648                                          state,
649                                          false,
650                                          CTDB_EVENT_UPDATE_IP,
651                                          "%s %s %s %u",
652                                          state->old->name,
653                                          new_name,
654                                          ctdb_addr_to_str(&vnn->public_address),
655                                          vnn->public_netmask_bits);
656         if (ret != 0) {
657                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
658                                  ctdb_addr_to_str(&vnn->public_address),
659                                  old->name, new_name));
660                 talloc_free(state);
661                 return -1;
662         }
663
664         return 0;
665 }
666
667 /*
668   Find the vnn of the node that has a public ip address
669   returns -1 if the address is not known as a public address
670  */
671 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
672 {
673         struct ctdb_vnn *vnn;
674
675         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
676                 if (ctdb_same_ip(&vnn->public_address, addr)) {
677                         return vnn;
678                 }
679         }
680
681         return NULL;
682 }
683
684 /*
685   take over an ip address
686  */
687 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
688                                  struct ctdb_req_control *c,
689                                  TDB_DATA indata,
690                                  bool *async_reply)
691 {
692         int ret;
693         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
694         struct ctdb_vnn *vnn;
695         bool have_ip = false;
696         bool do_updateip = false;
697         bool do_takeip = false;
698         struct ctdb_iface *best_iface = NULL;
699
700         if (pip->pnn != ctdb->pnn) {
701                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
702                                  "with pnn %d, but we're node %d\n",
703                                  ctdb_addr_to_str(&pip->addr),
704                                  pip->pnn, ctdb->pnn));
705                 return -1;
706         }
707
708         /* update out vnn list */
709         vnn = find_public_ip_vnn(ctdb, &pip->addr);
710         if (vnn == NULL) {
711                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
712                         ctdb_addr_to_str(&pip->addr)));
713                 return 0;
714         }
715
716         if (ctdb->do_checkpublicip) {
717                 have_ip = ctdb_sys_have_ip(&pip->addr);
718         }
719         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
720         if (best_iface == NULL) {
721                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
722                                  "a usable interface (old %s, have_ip %d)\n",
723                                  ctdb_addr_to_str(&vnn->public_address),
724                                  vnn->public_netmask_bits,
725                                  ctdb_vnn_iface_string(vnn),
726                                  have_ip));
727                 return -1;
728         }
729
730         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
731                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
732                 have_ip = false;
733         }
734
735
736         if (vnn->iface == NULL && have_ip) {
737                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
738                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
739                                  ctdb_addr_to_str(&vnn->public_address)));
740                 return 0;
741         }
742
743         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
744                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745                                   "and we have it on iface[%s], but it was assigned to node %d"
746                                   "and we are node %d, banning ourself\n",
747                                  ctdb_addr_to_str(&vnn->public_address),
748                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
749                 ctdb_ban_self(ctdb);
750                 return -1;
751         }
752
753         if (vnn->pnn == -1 && have_ip) {
754                 vnn->pnn = ctdb->pnn;
755                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
756                                   "and we already have it on iface[%s], update local daemon\n",
757                                  ctdb_addr_to_str(&vnn->public_address),
758                                   ctdb_vnn_iface_string(vnn)));
759                 return 0;
760         }
761
762         if (vnn->iface) {
763                 if (vnn->iface != best_iface) {
764                         if (!vnn->iface->link_up) {
765                                 do_updateip = true;
766                         } else if (vnn->iface->references > (best_iface->references + 1)) {
767                                 /* only move when the rebalance gains something */
768                                         do_updateip = true;
769                         }
770                 }
771         }
772
773         if (!have_ip) {
774                 if (do_updateip) {
775                         ctdb_vnn_unassign_iface(ctdb, vnn);
776                         do_updateip = false;
777                 }
778                 do_takeip = true;
779         }
780
781         if (do_takeip) {
782                 ret = ctdb_do_takeip(ctdb, c, vnn);
783                 if (ret != 0) {
784                         return -1;
785                 }
786         } else if (do_updateip) {
787                 ret = ctdb_do_updateip(ctdb, c, vnn);
788                 if (ret != 0) {
789                         return -1;
790                 }
791         } else {
792                 /*
793                  * The interface is up and the kernel known the ip
794                  * => do nothing
795                  */
796                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
797                         ctdb_addr_to_str(&pip->addr),
798                         vnn->public_netmask_bits,
799                         ctdb_vnn_iface_string(vnn)));
800                 return 0;
801         }
802
803         /* tell ctdb_control.c that we will be replying asynchronously */
804         *async_reply = true;
805
806         return 0;
807 }
808
809 /*
810   takeover an ip address old v4 style
811  */
812 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
813                                 struct ctdb_req_control *c,
814                                 TDB_DATA indata, 
815                                 bool *async_reply)
816 {
817         TDB_DATA data;
818         
819         data.dsize = sizeof(struct ctdb_public_ip);
820         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
821         CTDB_NO_MEMORY(ctdb, data.dptr);
822         
823         memcpy(data.dptr, indata.dptr, indata.dsize);
824         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
825 }
826
827 /*
828   kill any clients that are registered with a IP that is being released
829  */
830 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
831 {
832         struct ctdb_client_ip *ip;
833
834         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
835                 ctdb_addr_to_str(addr)));
836
837         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
838                 ctdb_sock_addr tmp_addr;
839
840                 tmp_addr = ip->addr;
841                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
842                         ip->client_id,
843                         ctdb_addr_to_str(&ip->addr)));
844
845                 if (ctdb_same_ip(&tmp_addr, addr)) {
846                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
847                                                                      ip->client_id, 
848                                                                      struct ctdb_client);
849                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
850                                 ip->client_id,
851                                 ctdb_addr_to_str(&ip->addr),
852                                 client->pid));
853
854                         if (client->pid != 0) {
855                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
856                                         (unsigned)client->pid,
857                                         ctdb_addr_to_str(addr),
858                                         ip->client_id));
859                                 ctdb_kill(ctdb, client->pid, SIGKILL);
860                         }
861                 }
862         }
863 }
864
865 /*
866   called when releaseip event finishes
867  */
868 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
869                                 void *private_data)
870 {
871         struct takeover_callback_state *state = 
872                 talloc_get_type(private_data, struct takeover_callback_state);
873         TDB_DATA data;
874
875         if (status == -ETIME) {
876                 ctdb_ban_self(ctdb);
877         }
878
879         /* send a message to all clients of this node telling them
880            that the cluster has been reconfigured and they should
881            release any sockets on this IP */
882         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
883         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
884         data.dsize = strlen((char *)data.dptr)+1;
885
886         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
887
888         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
889
890         /* kill clients that have registered with this IP */
891         release_kill_clients(ctdb, state->addr);
892
893         ctdb_vnn_unassign_iface(ctdb, state->vnn);
894
895         /* the control succeeded */
896         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
897         talloc_free(state);
898 }
899
900 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
901 {
902         state->vnn->update_in_flight = false;
903         return 0;
904 }
905
906 /*
907   release an ip address
908  */
909 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
910                                 struct ctdb_req_control *c,
911                                 TDB_DATA indata, 
912                                 bool *async_reply)
913 {
914         int ret;
915         struct takeover_callback_state *state;
916         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
917         struct ctdb_vnn *vnn;
918         char *iface;
919
920         /* update our vnn list */
921         vnn = find_public_ip_vnn(ctdb, &pip->addr);
922         if (vnn == NULL) {
923                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
924                         ctdb_addr_to_str(&pip->addr)));
925                 return 0;
926         }
927         vnn->pnn = pip->pnn;
928
929         /* stop any previous arps */
930         talloc_free(vnn->takeover_ctx);
931         vnn->takeover_ctx = NULL;
932
933         /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
934          * lazy multicast to drop an IP from any node that isn't the
935          * intended new node.  The following causes makes ctdbd ignore
936          * a release for any address it doesn't host.
937          */
938         if (ctdb->do_checkpublicip) {
939                 if (!ctdb_sys_have_ip(&pip->addr)) {
940                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
941                                 ctdb_addr_to_str(&pip->addr),
942                                 vnn->public_netmask_bits,
943                                 ctdb_vnn_iface_string(vnn)));
944                         ctdb_vnn_unassign_iface(ctdb, vnn);
945                         return 0;
946                 }
947         } else {
948                 if (vnn->iface == NULL) {
949                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
950                                            ctdb_addr_to_str(&pip->addr),
951                                            vnn->public_netmask_bits));
952                         return 0;
953                 }
954         }
955
956         /* There is a potential race between take_ip and us because we
957          * update the VNN via a callback that run when the
958          * eventscripts have been run.  Avoid the race by allowing one
959          * update to be in flight at a time.
960          */
961         if (vnn->update_in_flight) {
962                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
963                                     "update for this IP already in flight\n",
964                                     ctdb_addr_to_str(&vnn->public_address),
965                                     vnn->public_netmask_bits));
966                 return -1;
967         }
968
969         if (ctdb->do_checkpublicip) {
970                 iface = ctdb_sys_find_ifname(&pip->addr);
971                 if (iface == NULL) {
972                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
973                         return 0;
974                 }
975         } else {
976                 iface = strdup(ctdb_vnn_iface_string(vnn));
977         }
978
979         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
980                 ctdb_addr_to_str(&pip->addr),
981                 vnn->public_netmask_bits,
982                 iface,
983                 pip->pnn));
984
985         state = talloc(ctdb, struct takeover_callback_state);
986         CTDB_NO_MEMORY(ctdb, state);
987
988         state->c = talloc_steal(state, c);
989         state->addr = talloc(state, ctdb_sock_addr);       
990         CTDB_NO_MEMORY(ctdb, state->addr);
991         *state->addr = pip->addr;
992         state->vnn   = vnn;
993
994         vnn->update_in_flight = true;
995         talloc_set_destructor(state, ctdb_releaseip_destructor);
996
997         ret = ctdb_event_script_callback(ctdb, 
998                                          state, release_ip_callback, state,
999                                          false,
1000                                          CTDB_EVENT_RELEASE_IP,
1001                                          "%s %s %u",
1002                                          iface,
1003                                          ctdb_addr_to_str(&pip->addr),
1004                                          vnn->public_netmask_bits);
1005         free(iface);
1006         if (ret != 0) {
1007                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1008                         ctdb_addr_to_str(&pip->addr),
1009                         ctdb_vnn_iface_string(vnn)));
1010                 talloc_free(state);
1011                 return -1;
1012         }
1013
1014         /* tell the control that we will be reply asynchronously */
1015         *async_reply = true;
1016         return 0;
1017 }
1018
1019 /*
1020   release an ip address old v4 style
1021  */
1022 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
1023                                 struct ctdb_req_control *c,
1024                                 TDB_DATA indata, 
1025                                 bool *async_reply)
1026 {
1027         TDB_DATA data;
1028         
1029         data.dsize = sizeof(struct ctdb_public_ip);
1030         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1031         CTDB_NO_MEMORY(ctdb, data.dptr);
1032         
1033         memcpy(data.dptr, indata.dptr, indata.dsize);
1034         return ctdb_control_release_ip(ctdb, c, data, async_reply);
1035 }
1036
1037
1038 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1039                                    ctdb_sock_addr *addr,
1040                                    unsigned mask, const char *ifaces,
1041                                    bool check_address)
1042 {
1043         struct ctdb_vnn      *vnn;
1044         uint32_t num = 0;
1045         char *tmp;
1046         const char *iface;
1047         int i;
1048         int ret;
1049
1050         tmp = strdup(ifaces);
1051         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1052                 if (!ctdb_sys_check_iface_exists(iface)) {
1053                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1054                         free(tmp);
1055                         return -1;
1056                 }
1057         }
1058         free(tmp);
1059
1060         /* Verify that we dont have an entry for this ip yet */
1061         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1062                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1063                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1064                                 ctdb_addr_to_str(addr)));
1065                         return -1;
1066                 }               
1067         }
1068
1069         /* create a new vnn structure for this ip address */
1070         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1071         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1072         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1073         tmp = talloc_strdup(vnn, ifaces);
1074         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1075         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1077                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1078                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1079                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1080                 num++;
1081         }
1082         talloc_free(tmp);
1083         vnn->ifaces[num] = NULL;
1084         vnn->public_address      = *addr;
1085         vnn->public_netmask_bits = mask;
1086         vnn->pnn                 = -1;
1087         if (check_address) {
1088                 if (ctdb_sys_have_ip(addr)) {
1089                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1090                         vnn->pnn = ctdb->pnn;
1091                 }
1092         }
1093
1094         for (i=0; vnn->ifaces[i]; i++) {
1095                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1096                 if (ret != 0) {
1097                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1098                                            "for public_address[%s]\n",
1099                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1100                         talloc_free(vnn);
1101                         return -1;
1102                 }
1103         }
1104
1105         DLIST_ADD(ctdb->vnn, vnn);
1106
1107         return 0;
1108 }
1109
1110 /*
1111   setup the event script directory
1112 */
1113 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1114 {
1115         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1116         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1117         return 0;
1118 }
1119
1120 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
1121                                   struct timeval t, void *private_data)
1122 {
1123         struct ctdb_context *ctdb = talloc_get_type(private_data, 
1124                                                         struct ctdb_context);
1125         struct ctdb_vnn *vnn;
1126
1127         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1128                 int i;
1129
1130                 for (i=0; vnn->ifaces[i] != NULL; i++) {
1131                         if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1132                                 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1133                                         vnn->ifaces[i],
1134                                         ctdb_addr_to_str(&vnn->public_address)));
1135                         }
1136                 }
1137         }
1138
1139         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1140                 timeval_current_ofs(30, 0), 
1141                 ctdb_check_interfaces_event, ctdb);
1142 }
1143
1144
1145 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1146 {
1147         if (ctdb->check_public_ifaces_ctx != NULL) {
1148                 talloc_free(ctdb->check_public_ifaces_ctx);
1149                 ctdb->check_public_ifaces_ctx = NULL;
1150         }
1151
1152         ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1153         if (ctdb->check_public_ifaces_ctx == NULL) {
1154                 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1155         }
1156
1157         event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx, 
1158                 timeval_current_ofs(30, 0), 
1159                 ctdb_check_interfaces_event, ctdb);
1160
1161         return 0;
1162 }
1163
1164
1165 /*
1166   setup the public address lists from a file
1167 */
1168 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1169 {
1170         char **lines;
1171         int nlines;
1172         int i;
1173
1174         lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1175         if (lines == NULL) {
1176                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1177                 return -1;
1178         }
1179         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1180                 nlines--;
1181         }
1182
1183         for (i=0;i<nlines;i++) {
1184                 unsigned mask;
1185                 ctdb_sock_addr addr;
1186                 const char *addrstr;
1187                 const char *ifaces;
1188                 char *tok, *line;
1189
1190                 line = lines[i];
1191                 while ((*line == ' ') || (*line == '\t')) {
1192                         line++;
1193                 }
1194                 if (*line == '#') {
1195                         continue;
1196                 }
1197                 if (strcmp(line, "") == 0) {
1198                         continue;
1199                 }
1200                 tok = strtok(line, " \t");
1201                 addrstr = tok;
1202                 tok = strtok(NULL, " \t");
1203                 if (tok == NULL) {
1204                         if (NULL == ctdb->default_public_interface) {
1205                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1206                                          i+1));
1207                                 talloc_free(lines);
1208                                 return -1;
1209                         }
1210                         ifaces = ctdb->default_public_interface;
1211                 } else {
1212                         ifaces = tok;
1213                 }
1214
1215                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1216                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1217                         talloc_free(lines);
1218                         return -1;
1219                 }
1220                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1221                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1222                         talloc_free(lines);
1223                         return -1;
1224                 }
1225         }
1226
1227
1228         talloc_free(lines);
1229         return 0;
1230 }
1231
1232 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1233                               const char *iface,
1234                               const char *ip)
1235 {
1236         struct ctdb_vnn *svnn;
1237         struct ctdb_iface *cur = NULL;
1238         bool ok;
1239         int ret;
1240
1241         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1242         CTDB_NO_MEMORY(ctdb, svnn);
1243
1244         svnn->ifaces = talloc_array(svnn, const char *, 2);
1245         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1246         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1247         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1248         svnn->ifaces[1] = NULL;
1249
1250         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1251         if (!ok) {
1252                 talloc_free(svnn);
1253                 return -1;
1254         }
1255
1256         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1257         if (ret != 0) {
1258                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1259                                    "for single_ip[%s]\n",
1260                                    svnn->ifaces[0],
1261                                    ctdb_addr_to_str(&svnn->public_address)));
1262                 talloc_free(svnn);
1263                 return -1;
1264         }
1265
1266         /* assume the single public ip interface is initially "good" */
1267         cur = ctdb_find_iface(ctdb, iface);
1268         if (cur == NULL) {
1269                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1270                 return -1;
1271         }
1272         cur->link_up = true;
1273
1274         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1275         if (ret != 0) {
1276                 talloc_free(svnn);
1277                 return -1;
1278         }
1279
1280         ctdb->single_ip_vnn = svnn;
1281         return 0;
1282 }
1283
1284 /* Given a physical node, return the number of
1285    public addresses that is currently assigned to this node.
1286 */
1287 static int node_ip_coverage(struct ctdb_context *ctdb, 
1288         int32_t pnn,
1289         struct ctdb_public_ip_list *ips)
1290 {
1291         int num=0;
1292
1293         for (;ips;ips=ips->next) {
1294                 if (ips->pnn == pnn) {
1295                         num++;
1296                 }
1297         }
1298         return num;
1299 }
1300
1301
1302 /* Can the given node host the given IP: is the public IP known to the
1303  * node and is NOIPHOST unset?
1304 */
1305 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
1306                              struct ctdb_node_map *nodemap,
1307                              struct ctdb_public_ip_list *ip)
1308 {
1309         struct ctdb_all_public_ips *public_ips;
1310         int i;
1311
1312         if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPHOST) {
1313                 return false;
1314         }
1315
1316         public_ips = ctdb->nodes[pnn]->available_public_ips;
1317
1318         if (public_ips == NULL) {
1319                 return false;
1320         }
1321
1322         for (i=0;i<public_ips->num;i++) {
1323                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1324                         /* yes, this node can serve this public ip */
1325                         return true;
1326                 }
1327         }
1328
1329         return false;
1330 }
1331
1332 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
1333                                  struct ctdb_node_map *nodemap,
1334                                  struct ctdb_public_ip_list *ip)
1335 {
1336         if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPTAKEOVER) {
1337                 return false;
1338         }
1339
1340         return can_node_host_ip(ctdb, pnn, nodemap, ip);
1341 }
1342
1343 /* search the node lists list for a node to takeover this ip.
1344    pick the node that currently are serving the least number of ips
1345    so that the ips get spread out evenly.
1346 */
1347 static int find_takeover_node(struct ctdb_context *ctdb, 
1348                 struct ctdb_node_map *nodemap,
1349                 struct ctdb_public_ip_list *ip,
1350                 struct ctdb_public_ip_list *all_ips)
1351 {
1352         int pnn, min=0, num;
1353         int i;
1354
1355         pnn    = -1;
1356         for (i=0;i<nodemap->num;i++) {
1357                 /* verify that this node can serve this ip */
1358                 if (!can_node_takeover_ip(ctdb, i, nodemap, ip)) {
1359                         /* no it couldnt   so skip to the next node */
1360                         continue;
1361                 }
1362
1363                 num = node_ip_coverage(ctdb, i, all_ips);
1364                 /* was this the first node we checked ? */
1365                 if (pnn == -1) {
1366                         pnn = i;
1367                         min  = num;
1368                 } else {
1369                         if (num < min) {
1370                                 pnn = i;
1371                                 min  = num;
1372                         }
1373                 }
1374         }       
1375         if (pnn == -1) {
1376                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1377                         ctdb_addr_to_str(&ip->addr)));
1378
1379                 return -1;
1380         }
1381
1382         ip->pnn = pnn;
1383         return 0;
1384 }
1385
1386 #define IP_KEYLEN       4
1387 static uint32_t *ip_key(ctdb_sock_addr *ip)
1388 {
1389         static uint32_t key[IP_KEYLEN];
1390
1391         bzero(key, sizeof(key));
1392
1393         switch (ip->sa.sa_family) {
1394         case AF_INET:
1395                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1396                 break;
1397         case AF_INET6: {
1398                 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1399                 key[0]  = htonl(s6_a32[0]);
1400                 key[1]  = htonl(s6_a32[1]);
1401                 key[2]  = htonl(s6_a32[2]);
1402                 key[3]  = htonl(s6_a32[3]);
1403                 break;
1404         }
1405         default:
1406                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1407                 return key;
1408         }
1409
1410         return key;
1411 }
1412
1413 static void *add_ip_callback(void *parm, void *data)
1414 {
1415         struct ctdb_public_ip_list *this_ip = parm; 
1416         struct ctdb_public_ip_list *prev_ip = data; 
1417
1418         if (prev_ip == NULL) {
1419                 return parm;
1420         }
1421         if (this_ip->pnn == -1) {
1422                 this_ip->pnn = prev_ip->pnn;
1423         }
1424
1425         return parm;
1426 }
1427
1428 static int getips_count_callback(void *param, void *data)
1429 {
1430         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1431         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1432
1433         new_ip->next = *ip_list;
1434         *ip_list     = new_ip;
1435         return 0;
1436 }
1437
1438 static struct ctdb_public_ip_list *
1439 create_merged_ip_list(struct ctdb_context *ctdb)
1440 {
1441         int i, j;
1442         struct ctdb_public_ip_list *ip_list;
1443         struct ctdb_all_public_ips *public_ips;
1444
1445         if (ctdb->ip_tree != NULL) {
1446                 talloc_free(ctdb->ip_tree);
1447                 ctdb->ip_tree = NULL;
1448         }
1449         ctdb->ip_tree = trbt_create(ctdb, 0);
1450
1451         for (i=0;i<ctdb->num_nodes;i++) {
1452                 public_ips = ctdb->nodes[i]->known_public_ips;
1453
1454                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1455                         continue;
1456                 }
1457
1458                 /* there were no public ips for this node */
1459                 if (public_ips == NULL) {
1460                         continue;
1461                 }               
1462
1463                 for (j=0;j<public_ips->num;j++) {
1464                         struct ctdb_public_ip_list *tmp_ip; 
1465
1466                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1467                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1468                         /* Do not use information about IP addresses hosted
1469                          * on other nodes, it may not be accurate */
1470                         if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1471                                 tmp_ip->pnn = public_ips->ips[j].pnn;
1472                         } else {
1473                                 tmp_ip->pnn = -1;
1474                         }
1475                         tmp_ip->addr = public_ips->ips[j].addr;
1476                         tmp_ip->next = NULL;
1477
1478                         trbt_insertarray32_callback(ctdb->ip_tree,
1479                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1480                                 add_ip_callback,
1481                                 tmp_ip);
1482                 }
1483         }
1484
1485         ip_list = NULL;
1486         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1487
1488         return ip_list;
1489 }
1490
1491 /* 
1492  * This is the length of the longtest common prefix between the IPs.
1493  * It is calculated by XOR-ing the 2 IPs together and counting the
1494  * number of leading zeroes.  The implementation means that all
1495  * addresses end up being 128 bits long.
1496  *
1497  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1498  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1499  * lots of nodes and IP addresses?
1500  */
1501 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1502 {
1503         uint32_t ip1_k[IP_KEYLEN];
1504         uint32_t *t;
1505         int i;
1506         uint32_t x;
1507
1508         uint32_t distance = 0;
1509
1510         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1511         t = ip_key(ip2);
1512         for (i=0; i<IP_KEYLEN; i++) {
1513                 x = ip1_k[i] ^ t[i];
1514                 if (x == 0) {
1515                         distance += 32;
1516                 } else {
1517                         /* Count number of leading zeroes. 
1518                          * FIXME? This could be optimised...
1519                          */
1520                         while ((x & (1 << 31)) == 0) {
1521                                 x <<= 1;
1522                                 distance += 1;
1523                         }
1524                 }
1525         }
1526
1527         return distance;
1528 }
1529
1530 /* Calculate the IP distance for the given IP relative to IPs on the
1531    given node.  The ips argument is generally the all_ips variable
1532    used in the main part of the algorithm.
1533  */
1534 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1535                                   struct ctdb_public_ip_list *ips,
1536                                   int pnn)
1537 {
1538         struct ctdb_public_ip_list *t;
1539         uint32_t d;
1540
1541         uint32_t sum = 0;
1542
1543         for (t=ips; t != NULL; t=t->next) {
1544                 if (t->pnn != pnn) {
1545                         continue;
1546                 }
1547
1548                 /* Optimisation: We never calculate the distance
1549                  * between an address and itself.  This allows us to
1550                  * calculate the effect of removing an address from a
1551                  * node by simply calculating the distance between
1552                  * that address and all of the exitsing addresses.
1553                  * Moreover, we assume that we're only ever dealing
1554                  * with addresses from all_ips so we can identify an
1555                  * address via a pointer rather than doing a more
1556                  * expensive address comparison. */
1557                 if (&(t->addr) == ip) {
1558                         continue;
1559                 }
1560
1561                 d = ip_distance(ip, &(t->addr));
1562                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1563         }
1564
1565         return sum;
1566 }
1567
1568 /* Return the LCP2 imbalance metric for addresses currently assigned
1569    to the given node.
1570  */
1571 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1572 {
1573         struct ctdb_public_ip_list *t;
1574
1575         uint32_t imbalance = 0;
1576
1577         for (t=all_ips; t!=NULL; t=t->next) {
1578                 if (t->pnn != pnn) {
1579                         continue;
1580                 }
1581                 /* Pass the rest of the IPs rather than the whole
1582                    all_ips input list.
1583                 */
1584                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1585         }
1586
1587         return imbalance;
1588 }
1589
1590 /* Allocate any unassigned IPs just by looping through the IPs and
1591  * finding the best node for each.
1592  */
1593 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1594                                       struct ctdb_node_map *nodemap,
1595                                       struct ctdb_public_ip_list *all_ips)
1596 {
1597         struct ctdb_public_ip_list *tmp_ip;
1598
1599         /* loop over all ip's and find a physical node to cover for 
1600            each unassigned ip.
1601         */
1602         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1603                 if (tmp_ip->pnn == -1) {
1604                         if (find_takeover_node(ctdb, nodemap, tmp_ip, all_ips)) {
1605                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1606                                         ctdb_addr_to_str(&tmp_ip->addr)));
1607                         }
1608                 }
1609         }
1610 }
1611
1612 /* Basic non-deterministic rebalancing algorithm.
1613  */
1614 static void basic_failback(struct ctdb_context *ctdb,
1615                            struct ctdb_node_map *nodemap,
1616                            struct ctdb_public_ip_list *all_ips,
1617                            int num_ips)
1618 {
1619         int i;
1620         int maxnode, maxnum, minnode, minnum, num, retries;
1621         struct ctdb_public_ip_list *tmp_ip;
1622
1623         retries = 0;
1624
1625 try_again:
1626         maxnum=0;
1627         minnum=0;
1628
1629         /* for each ip address, loop over all nodes that can serve
1630            this ip and make sure that the difference between the node
1631            serving the most and the node serving the least ip's are
1632            not greater than 1.
1633         */
1634         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1635                 if (tmp_ip->pnn == -1) {
1636                         continue;
1637                 }
1638
1639                 /* Get the highest and lowest number of ips's served by any 
1640                    valid node which can serve this ip.
1641                 */
1642                 maxnode = -1;
1643                 minnode = -1;
1644                 for (i=0;i<nodemap->num;i++) {
1645                         /* only check nodes that can actually serve this ip */
1646                         if (!can_node_takeover_ip(ctdb, i, nodemap, tmp_ip)) {
1647                                 /* no it couldnt   so skip to the next node */
1648                                 continue;
1649                         }
1650
1651                         num = node_ip_coverage(ctdb, i, all_ips);
1652                         if (maxnode == -1) {
1653                                 maxnode = i;
1654                                 maxnum  = num;
1655                         } else {
1656                                 if (num > maxnum) {
1657                                         maxnode = i;
1658                                         maxnum  = num;
1659                                 }
1660                         }
1661                         if (minnode == -1) {
1662                                 minnode = i;
1663                                 minnum  = num;
1664                         } else {
1665                                 if (num < minnum) {
1666                                         minnode = i;
1667                                         minnum  = num;
1668                                 }
1669                         }
1670                 }
1671                 if (maxnode == -1) {
1672                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1673                                 ctdb_addr_to_str(&tmp_ip->addr)));
1674
1675                         continue;
1676                 }
1677
1678                 /* if the spread between the smallest and largest coverage by
1679                    a node is >=2 we steal one of the ips from the node with
1680                    most coverage to even things out a bit.
1681                    try to do this a limited number of times since we dont
1682                    want to spend too much time balancing the ip coverage.
1683                 */
1684                 if ( (maxnum > minnum+1)
1685                      && (retries < (num_ips + 5)) ){
1686                         struct ctdb_public_ip_list *tmp;
1687
1688                         /* Reassign one of maxnode's VNNs */
1689                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1690                                 if (tmp->pnn == maxnode) {
1691                                         (void)find_takeover_node(ctdb, nodemap, tmp, all_ips);
1692                                         retries++;
1693                                         goto try_again;;
1694                                 }
1695                         }
1696                 }
1697         }
1698 }
1699
1700 struct ctdb_rebalancenodes {
1701         struct ctdb_rebalancenodes *next;
1702         uint32_t pnn;
1703 };
1704 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1705
1706
1707 /* set this flag to force the node to be rebalanced even if it just didnt
1708    become healthy again.
1709 */
1710 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1711 {
1712         struct ctdb_rebalancenodes *rebalance;
1713
1714         for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1715                 if (rebalance->pnn == pnn) {
1716                         return;
1717                 }
1718         }
1719
1720         rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1721         rebalance->pnn = pnn;
1722         rebalance->next = force_rebalance_list;
1723         force_rebalance_list = rebalance;
1724 }
1725
1726 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1727  * that we can unit test it.
1728  */
1729 static void lcp2_init(struct ctdb_context * tmp_ctx,
1730                struct ctdb_node_map * nodemap,
1731                uint32_t mask,
1732                struct ctdb_public_ip_list *all_ips,
1733                uint32_t **lcp2_imbalances,
1734                bool **rebalance_candidates)
1735 {
1736         int i;
1737         struct ctdb_public_ip_list *tmp_ip;
1738
1739         *rebalance_candidates = talloc_array(tmp_ctx, bool, nodemap->num);
1740         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1741         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1742         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1743
1744         for (i=0;i<nodemap->num;i++) {
1745                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1746                 /* First step: assume all nodes are candidates */
1747                 (*rebalance_candidates)[i] = true;
1748         }
1749
1750         /* 2nd step: if a node has IPs assigned then it must have been
1751          * healthy before, so we remove it from consideration.  This
1752          * is overkill but is all we have because we don't maintain
1753          * state between takeover runs.  An alternative would be to
1754          * keep state and invalidate it every time the recovery master
1755          * changes.
1756          */
1757         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1758                 if (tmp_ip->pnn != -1) {
1759                         (*rebalance_candidates)[tmp_ip->pnn] = false;
1760                 }
1761         }
1762
1763         /* 3rd step: if a node is forced to re-balance then
1764            we allow failback onto the node */
1765         while (force_rebalance_list != NULL) {
1766                 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1767
1768                 if (force_rebalance_list->pnn <= nodemap->num) {
1769                         (*rebalance_candidates)[force_rebalance_list->pnn] = true;
1770                 }
1771
1772                 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1773                 talloc_free(force_rebalance_list);
1774                 force_rebalance_list = next;
1775         }
1776 }
1777
1778 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1779  * the IP/node combination that will cost the least.
1780  */
1781 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1782                                      struct ctdb_node_map *nodemap,
1783                                      struct ctdb_public_ip_list *all_ips,
1784                                      uint32_t *lcp2_imbalances)
1785 {
1786         struct ctdb_public_ip_list *tmp_ip;
1787         int dstnode;
1788
1789         int minnode;
1790         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1791         struct ctdb_public_ip_list *minip;
1792
1793         bool should_loop = true;
1794         bool have_unassigned = true;
1795
1796         while (have_unassigned && should_loop) {
1797                 should_loop = false;
1798
1799                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1800                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1801
1802                 minnode = -1;
1803                 mindsum = 0;
1804                 minip = NULL;
1805
1806                 /* loop over each unassigned ip. */
1807                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1808                         if (tmp_ip->pnn != -1) {
1809                                 continue;
1810                         }
1811
1812                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1813                                 /* only check nodes that can actually takeover this ip */
1814                                 if (!can_node_takeover_ip(ctdb, dstnode,
1815                                                           nodemap, tmp_ip)) {
1816                                         /* no it couldnt   so skip to the next node */
1817                                         continue;
1818                                 }
1819
1820                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1821                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1822                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1823                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1824                                                    dstnode,
1825                                                    dstimbl - lcp2_imbalances[dstnode]));
1826
1827
1828                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1829                                         minnode = dstnode;
1830                                         minimbl = dstimbl;
1831                                         mindsum = dstdsum;
1832                                         minip = tmp_ip;
1833                                         should_loop = true;
1834                                 }
1835                         }
1836                 }
1837
1838                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1839
1840                 /* If we found one then assign it to the given node. */
1841                 if (minnode != -1) {
1842                         minip->pnn = minnode;
1843                         lcp2_imbalances[minnode] = minimbl;
1844                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1845                                           ctdb_addr_to_str(&(minip->addr)),
1846                                           minnode,
1847                                           mindsum));
1848                 }
1849
1850                 /* There might be a better way but at least this is clear. */
1851                 have_unassigned = false;
1852                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1853                         if (tmp_ip->pnn == -1) {
1854                                 have_unassigned = true;
1855                         }
1856                 }
1857         }
1858
1859         /* We know if we have an unassigned addresses so we might as
1860          * well optimise.
1861          */
1862         if (have_unassigned) {
1863                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1864                         if (tmp_ip->pnn == -1) {
1865                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1866                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1867                         }
1868                 }
1869         }
1870 }
1871
1872 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1873  * to move IPs from, determines the best IP/destination node
1874  * combination to move from the source node.
1875  */
1876 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1877                                     struct ctdb_node_map *nodemap,
1878                                     struct ctdb_public_ip_list *all_ips,
1879                                     int srcnode,
1880                                     uint32_t candimbl,
1881                                     uint32_t *lcp2_imbalances,
1882                                     bool *rebalance_candidates)
1883 {
1884         int dstnode, mindstnode;
1885         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1886         uint32_t minsrcimbl, mindstimbl;
1887         struct ctdb_public_ip_list *minip;
1888         struct ctdb_public_ip_list *tmp_ip;
1889
1890         /* Find an IP and destination node that best reduces imbalance. */
1891         minip = NULL;
1892         minsrcimbl = 0;
1893         mindstnode = -1;
1894         mindstimbl = 0;
1895
1896         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1897         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1898
1899         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1900                 /* Only consider addresses on srcnode. */
1901                 if (tmp_ip->pnn != srcnode) {
1902                         continue;
1903                 }
1904
1905                 /* What is this IP address costing the source node? */
1906                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1907                 srcimbl = candimbl - srcdsum;
1908
1909                 /* Consider this IP address would cost each potential
1910                  * destination node.  Destination nodes are limited to
1911                  * those that are newly healthy, since we don't want
1912                  * to do gratuitous failover of IPs just to make minor
1913                  * balance improvements.
1914                  */
1915                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1916                         if (!rebalance_candidates[dstnode]) {
1917                                 continue;
1918                         }
1919
1920                         /* only check nodes that can actually takeover this ip */
1921                         if (!can_node_takeover_ip(ctdb, dstnode,
1922                                                   nodemap, tmp_ip)) {
1923                                 /* no it couldnt   so skip to the next node */
1924                                 continue;
1925                         }
1926
1927                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1928                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1929                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1930                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1931                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1932                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1933
1934                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1935                             ((mindstnode == -1) ||                              \
1936                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1937
1938                                 minip = tmp_ip;
1939                                 minsrcimbl = srcimbl;
1940                                 mindstnode = dstnode;
1941                                 mindstimbl = dstimbl;
1942                         }
1943                 }
1944         }
1945         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1946
1947         if (mindstnode != -1) {
1948                 /* We found a move that makes things better... */
1949                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1950                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1951                                   ctdb_addr_to_str(&(minip->addr)),
1952                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1953
1954
1955                 lcp2_imbalances[srcnode] = srcimbl;
1956                 lcp2_imbalances[mindstnode] = mindstimbl;
1957                 minip->pnn = mindstnode;
1958
1959                 return true;
1960         }
1961
1962         return false;
1963         
1964 }
1965
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance without losing track of which node is which.
 */
struct lcp2_imbalance_pnn {
        uint32_t imbalance;
        int pnn;
};

/* qsort() comparator for struct lcp2_imbalance_pnn that orders
 * entries by DESCENDING imbalance: returns -1 when a's imbalance is
 * larger than b's, 1 when it is smaller, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
        const struct lcp2_imbalance_pnn *lhs = a;
        const struct lcp2_imbalance_pnn *rhs = b;

        /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
        return (rhs->imbalance > lhs->imbalance) -
               (lhs->imbalance > rhs->imbalance);
}
1984
1985 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1986  * node with the highest LCP2 imbalance, and then determines the best
1987  * IP/destination node combination to move from the source node.
1988  */
1989 static void lcp2_failback(struct ctdb_context *ctdb,
1990                           struct ctdb_node_map *nodemap,
1991                           struct ctdb_public_ip_list *all_ips,
1992                           uint32_t *lcp2_imbalances,
1993                           bool *rebalance_candidates)
1994 {
1995         int i, num_rebalance_candidates;
1996         struct lcp2_imbalance_pnn * lips;
1997         bool again;
1998
1999 try_again:
2000
2001         /* It is only worth continuing if we have suitable target
2002          * nodes to transfer IPs to.  This check is much cheaper than
2003          * continuing on...
2004          */
2005         num_rebalance_candidates = 0;
2006         for (i = 0; i < nodemap->num; i++) {
2007                 if (rebalance_candidates[i]) {
2008                         num_rebalance_candidates++;
2009                 }
2010         }
2011         if (num_rebalance_candidates == 0) {
2012                 return;
2013         }
2014
2015         /* Put the imbalances and nodes into an array, sort them and
2016          * iterate through candidates.  Usually the 1st one will be
2017          * used, so this doesn't cost much...
2018          */
2019         lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
2020         for (i = 0; i < nodemap->num; i++) {
2021                 lips[i].imbalance = lcp2_imbalances[i];
2022                 lips[i].pnn = i;
2023         }
2024         qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
2025               lcp2_cmp_imbalance_pnn);
2026
2027         again = false;
2028         for (i = 0; i < nodemap->num; i++) {
2029                 /* This means that all nodes had 0 or 1 addresses, so
2030                  * can't be imbalanced.
2031                  */
2032                 if (lips[i].imbalance == 0) {
2033                         break;
2034                 }
2035
2036                 if (lcp2_failback_candidate(ctdb,
2037                                             nodemap,
2038                                             all_ips,
2039                                             lips[i].pnn,
2040                                             lips[i].imbalance,
2041                                             lcp2_imbalances,
2042                                             rebalance_candidates)) {
2043                         again = true;
2044                         break;
2045                 }
2046         }
2047
2048         talloc_free(lips);
2049         if (again) {
2050                 goto try_again;
2051         }
2052 }
2053
2054 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2055                                     struct ctdb_node_map *nodemap,
2056                                     struct ctdb_public_ip_list *all_ips)
2057 {
2058         struct ctdb_public_ip_list *tmp_ip;
2059
2060         /* verify that the assigned nodes can serve that public ip
2061            and set it to -1 if not
2062         */
2063         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2064                 if (tmp_ip->pnn == -1) {
2065                         continue;
2066                 }
2067                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2068                                       nodemap, tmp_ip) != 0) {
2069                         /* this node can not serve this ip. */
2070                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2071                                            ctdb_addr_to_str(&(tmp_ip->addr)),
2072                                            tmp_ip->pnn));
2073                         tmp_ip->pnn = -1;
2074                 }
2075         }
2076 }
2077
2078 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2079                                        struct ctdb_node_map *nodemap,
2080                                        struct ctdb_public_ip_list *all_ips)
2081 {
2082         struct ctdb_public_ip_list *tmp_ip;
2083         int i;
2084
2085         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2086        /* Allocate IPs to nodes in a modulo fashion so that IPs will
2087         *  always be allocated the same way for a specific set of
2088         *  available/unavailable nodes.
2089         */
2090
2091         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2092                 tmp_ip->pnn = i%nodemap->num;
2093         }
2094
2095         /* IP failback doesn't make sense with deterministic
2096          * IPs, since the modulo step above implicitly fails
2097          * back IPs to their "home" node.
2098          */
2099         if (1 == ctdb->tunable.no_ip_failback) {
2100                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2101         }
2102
2103         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2104
2105         basic_allocate_unassigned(ctdb, nodemap, all_ips);
2106
2107         /* No failback here! */
2108 }
2109
2110 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2111                                           struct ctdb_node_map *nodemap,
2112                                           struct ctdb_public_ip_list *all_ips)
2113 {
2114         /* This should be pushed down into basic_failback. */
2115         struct ctdb_public_ip_list *tmp_ip;
2116         int num_ips = 0;
2117         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2118                 num_ips++;
2119         }
2120
2121         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2122
2123         basic_allocate_unassigned(ctdb, nodemap, all_ips);
2124
2125         /* If we don't want IPs to fail back then don't rebalance IPs. */
2126         if (1 == ctdb->tunable.no_ip_failback) {
2127                 return;
2128         }
2129
2130         /* Now, try to make sure the ip adresses are evenly distributed
2131            across the nodes.
2132         */
2133         basic_failback(ctdb, nodemap, all_ips, num_ips);
2134 }
2135
2136 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2137                           struct ctdb_node_map *nodemap,
2138                           struct ctdb_public_ip_list *all_ips,
2139                           uint32_t mask)
2140 {
2141         uint32_t *lcp2_imbalances;
2142         bool *rebalance_candidates;
2143
2144         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2145
2146         unassign_unsuitable_ips(ctdb, nodemap, all_ips);
2147
2148         lcp2_init(tmp_ctx, nodemap, mask, all_ips,
2149                   &lcp2_imbalances, &rebalance_candidates);
2150
2151         lcp2_allocate_unassigned(ctdb, nodemap, all_ips, lcp2_imbalances);
2152
2153         /* If we don't want IPs to fail back then don't rebalance IPs. */
2154         if (1 == ctdb->tunable.no_ip_failback) {
2155                 goto finished;
2156         }
2157
2158         /* Now, try to make sure the ip adresses are evenly distributed
2159            across the nodes.
2160         */
2161         lcp2_failback(ctdb, nodemap, all_ips,
2162                       lcp2_imbalances, rebalance_candidates);
2163
2164 finished:
2165         talloc_free(tmp_ctx);
2166 }
2167
2168 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2169 {
2170         int i, num_healthy;
2171
2172         /* Count how many completely healthy nodes we have */
2173         num_healthy = 0;
2174         for (i=0;i<nodemap->num;i++) {
2175                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2176                         num_healthy++;
2177                 }
2178         }
2179
2180         return num_healthy == 0;
2181 }
2182
2183 /* The calculation part of the IP allocation algorithm. */
2184 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2185                                    struct ctdb_node_map *nodemap,
2186                                    struct ctdb_public_ip_list **all_ips_p)
2187 {
2188         uint32_t mask;
2189
2190         /* If we have healthy nodes then we will only consider them
2191            for serving public addresses
2192         */
2193         mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
2194         if (all_nodes_are_disabled(nodemap) &&
2195             (ctdb->tunable.no_ip_host_on_all_disabled == 0)) {
2196                 /* We didnt have any completely healthy nodes so
2197                    use "disabled" nodes as a fallback
2198                 */
2199                 mask = NODE_FLAGS_INACTIVE;
2200         }
2201
2202         /* since nodes only know about those public addresses that
2203            can be served by that particular node, no single node has
2204            a full list of all public addresses that exist in the cluster.
2205            Walk over all node structures and create a merged list of
2206            all public addresses that exist in the cluster.
2207
2208            keep the tree of ips around as ctdb->ip_tree
2209         */
2210         *all_ips_p = create_merged_ip_list(ctdb);
2211
2212         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2213                 ip_alloc_lcp2(ctdb, nodemap, *all_ips_p, mask);
2214         } else if (1 == ctdb->tunable.deterministic_public_ips) {
2215                 ip_alloc_deterministic_ips(ctdb, nodemap, *all_ips_p);
2216         } else {
2217                 ip_alloc_nondeterministic_ips(ctdb, nodemap, *all_ips_p);
2218         }
2219
2220         /* at this point ->pnn is the node which will own each IP
2221            or -1 if there is no node that can cover this ip
2222         */
2223
2224         return;
2225 }
2226
2227 struct get_tunable_callback_data {
2228         const char *tunable;
2229         uint32_t *out;
2230 };
2231
2232 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2233                                  int32_t res, TDB_DATA outdata,
2234                                  void *callback)
2235 {
2236         struct get_tunable_callback_data *cd =
2237                 (struct get_tunable_callback_data *)callback;
2238         int size;
2239
2240         if (res != 0) {
2241                 DEBUG(DEBUG_ERR,
2242                       ("Failure to read \"%s\" tunable from remote node %d\n",
2243                        cd->tunable, pnn));
2244                 return;
2245         }
2246
2247         if (outdata.dsize != sizeof(uint32_t)) {
2248                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2249                                  cd->tunable, pnn, (int)sizeof(uint32_t),
2250                                  (int)outdata.dsize));
2251                 return;
2252         }
2253
2254         size = talloc_get_size(cd->out) / sizeof(uint32_t);
2255         if (pnn >= size) {
2256                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2257                                  cd->tunable, pnn, size));
2258                 return;
2259         }
2260
2261                 
2262         cd->out[pnn] = *(uint32_t *)outdata.dptr;
2263 }
2264
2265 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2266                                         TALLOC_CTX *tmp_ctx,
2267                                         struct ctdb_node_map *nodemap,
2268                                         const char *tunable)
2269 {
2270         TDB_DATA data;
2271         struct ctdb_control_get_tunable *t;
2272         uint32_t *nodes;
2273         uint32_t *tvals;
2274         struct get_tunable_callback_data callback_data;
2275
2276         tvals = talloc_zero_array(tmp_ctx, uint32_t, nodemap->num);
2277         CTDB_NO_MEMORY_NULL(ctdb, tvals);
2278         callback_data.out = tvals;
2279         callback_data.tunable = tunable;
2280
2281         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2282         data.dptr  = talloc_size(tmp_ctx, data.dsize);
2283         t = (struct ctdb_control_get_tunable *)data.dptr;
2284         t->length = strlen(tunable)+1;
2285         memcpy(t->name, tunable, t->length);
2286         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2287         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2288                                       nodes, 0, TAKEOVER_TIMEOUT(),
2289                                       false, data,
2290                                       get_tunable_callback, NULL,
2291                                       &callback_data) != 0) {
2292                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get %s tunable failed\n", tunable));
2293         }
2294         talloc_free(nodes);
2295         talloc_free(data.dptr);
2296
2297         return tvals;
2298 }
2299
2300 /* Set internal flags for IP allocation:
2301  *   Clear ip flags
2302  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2303  *   Set NOIPHOST ip flag for each INACTIVE node
2304  *   if all nodes are disabled:
2305  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2306  *   else
2307  *     Set NOIPHOST ip flags for disabled nodes
2308  */
2309 static void set_ipflags_internal(struct ctdb_node_map *nodemap,
2310                                  uint32_t *tval_noiptakeover,
2311                                  uint32_t *tval_noiphostonalldisabled)
2312 {
2313         int i;
2314
2315         /* Clear IP flags */
2316         for (i=0;i<nodemap->num;i++) {
2317                 nodemap->nodes[i].flags &=
2318                         ~(NODE_FLAGS_NOIPTAKEOVER|NODE_FLAGS_NOIPHOST);
2319         }
2320
2321         for (i=0;i<nodemap->num;i++) {
2322                 /* Can not take IPs on node with NoIPTakeover set */
2323                 if (tval_noiptakeover[i] != 0) {
2324                         nodemap->nodes[i].flags |= NODE_FLAGS_NOIPTAKEOVER;
2325                 }
2326
2327                 /* Can not host IPs on INACTIVE node */
2328                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2329                         nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2330                 }
2331         }
2332
2333         if (all_nodes_are_disabled(nodemap)) {
2334                 /* If all nodes are disabled, can not host IPs on node
2335                  * with NoIPHostOnAllDisabled set
2336                  */
2337                 for (i=0;i<nodemap->num;i++) {
2338                         if (tval_noiphostonalldisabled[i] != 0) {
2339                                 nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2340                         }
2341                 }
2342         } else {
2343                 /* If some nodes are not disabled, then can not host
2344                  * IPs on DISABLED node
2345                  */
2346                 for (i=0;i<nodemap->num;i++) {
2347                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2348                                 nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
2349                         }
2350                 }
2351         }
2352 }
2353
2354 static bool set_ipflags(struct ctdb_context *ctdb,
2355                         TALLOC_CTX *tmp_ctx,
2356                         struct ctdb_node_map *nodemap)
2357 {
2358         uint32_t *tval_noiptakeover;
2359         uint32_t *tval_noiphostonalldisabled;
2360
2361         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2362                                                    "NoIPTakeover");
2363         if (tval_noiptakeover == NULL) {
2364                 return false;
2365         }
2366
2367         tval_noiphostonalldisabled =
2368                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2369                                        "NoIPHostOnAllDisabled");
2370         if (tval_noiphostonalldisabled == NULL) {
2371                 return false;
2372         }
2373
2374         set_ipflags_internal(nodemap,
2375                              tval_noiptakeover, tval_noiphostonalldisabled);
2376
2377         talloc_free(tval_noiptakeover);
2378         talloc_free(tval_noiphostonalldisabled);
2379
2380         return true;
2381 }
2382
2383 /*
2384   make any IP alias changes for public addresses that are necessary 
2385  */
2386 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2387                       client_async_callback fail_callback, void *callback_data)
2388 {
2389         int i;
2390         struct ctdb_public_ip ip;
2391         struct ctdb_public_ipv4 ipv4;
2392         uint32_t *nodes;
2393         struct ctdb_public_ip_list *all_ips, *tmp_ip;
2394         TDB_DATA data;
2395         struct timeval timeout;
2396         struct client_async_data *async_data;
2397         struct ctdb_client_control_state *state;
2398         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2399         uint32_t disable_timeout;
2400
2401         /*
2402          * ip failover is completely disabled, just send out the 
2403          * ipreallocated event.
2404          */
2405         if (ctdb->tunable.disable_ip_failover != 0) {
2406                 goto ipreallocated;
2407         }
2408
2409         if (!set_ipflags(ctdb, tmp_ctx, nodemap)) {
2410                 DEBUG(DEBUG_ERR,("Failed to set IP flags from tunables\n"));
2411                 return -1;
2412         }
2413
2414         ZERO_STRUCT(ip);
2415
2416         /* Do the IP reassignment calculations */
2417         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
2418
2419         /* The recovery daemon does regular sanity checks of the IPs.
2420          * However, sometimes it is overzealous and thinks changes are
2421          * required when they're already underway.  This stops the
2422          * checks for a while before we start moving IPs.
2423          */
2424         disable_timeout = ctdb->tunable.takeover_timeout;
2425         data.dptr  = (uint8_t*)&disable_timeout;
2426         data.dsize = sizeof(disable_timeout);
2427         if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2428                                      CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2429                 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2430         }
2431
2432         /* now tell all nodes to delete any alias that they should not
2433            have.  This will be a NOOP on nodes that don't currently
2434            hold the given alias */
2435         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2436         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2437
2438         async_data->fail_callback = fail_callback;
2439         async_data->callback_data = callback_data;
2440
2441         for (i=0;i<nodemap->num;i++) {
2442                 /* don't talk to unconnected nodes, but do talk to banned nodes */
2443                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2444                         continue;
2445                 }
2446
2447                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2448                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2449                                 /* This node should be serving this
2450                                    vnn so dont tell it to release the ip
2451                                 */
2452                                 continue;
2453                         }
2454                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
2455                                 ipv4.pnn = tmp_ip->pnn;
2456                                 ipv4.sin = tmp_ip->addr.ip;
2457
2458                                 timeout = TAKEOVER_TIMEOUT();
2459                                 data.dsize = sizeof(ipv4);
2460                                 data.dptr  = (uint8_t *)&ipv4;
2461                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2462                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2463                                                 data, async_data,
2464                                                 &timeout, NULL);
2465                         } else {
2466                                 ip.pnn  = tmp_ip->pnn;
2467                                 ip.addr = tmp_ip->addr;
2468
2469                                 timeout = TAKEOVER_TIMEOUT();
2470                                 data.dsize = sizeof(ip);
2471                                 data.dptr  = (uint8_t *)&ip;
2472                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2473                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
2474                                                 data, async_data,
2475                                                 &timeout, NULL);
2476                         }
2477
2478                         if (state == NULL) {
2479                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2480                                 talloc_free(tmp_ctx);
2481                                 return -1;
2482                         }
2483                 
2484                         ctdb_client_async_add(async_data, state);
2485                 }
2486         }
2487         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2488                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2489                 talloc_free(tmp_ctx);
2490                 return -1;
2491         }
2492         talloc_free(async_data);
2493
2494
2495         /* tell all nodes to get their own IPs */
2496         async_data = talloc_zero(tmp_ctx, struct client_async_data);
2497         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2498
2499         async_data->fail_callback = fail_callback;
2500         async_data->callback_data = callback_data;
2501
2502         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2503                 if (tmp_ip->pnn == -1) {
2504                         /* this IP won't be taken over */
2505                         continue;
2506                 }
2507
2508                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2509                         ipv4.pnn = tmp_ip->pnn;
2510                         ipv4.sin = tmp_ip->addr.ip;
2511
2512                         timeout = TAKEOVER_TIMEOUT();
2513                         data.dsize = sizeof(ipv4);
2514                         data.dptr  = (uint8_t *)&ipv4;
2515                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2516                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2517                                         data, async_data,
2518                                         &timeout, NULL);
2519                 } else {
2520                         ip.pnn  = tmp_ip->pnn;
2521                         ip.addr = tmp_ip->addr;
2522
2523                         timeout = TAKEOVER_TIMEOUT();
2524                         data.dsize = sizeof(ip);
2525                         data.dptr  = (uint8_t *)&ip;
2526                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2527                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2528                                         data, async_data,
2529                                         &timeout, NULL);
2530                 }
2531                 if (state == NULL) {
2532                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2533                         talloc_free(tmp_ctx);
2534                         return -1;
2535                 }
2536                 
2537                 ctdb_client_async_add(async_data, state);
2538         }
2539         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2540                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2541                 talloc_free(tmp_ctx);
2542                 return -1;
2543         }
2544
2545 ipreallocated:
2546         /* 
2547          * Tell all nodes to run eventscripts to process the
2548          * "ipreallocated" event.  This can do a lot of things,
2549          * including restarting services to reconfigure them if public
2550          * IPs have moved.  Once upon a time this event only used to
2551          * update natwg.
2552          */
2553         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2554         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2555                                       nodes, 0, TAKEOVER_TIMEOUT(),
2556                                       false, tdb_null,
2557                                       NULL, fail_callback,
2558                                       callback_data) != 0) {
2559                 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2560         }
2561
2562         talloc_free(tmp_ctx);
2563         return 0;
2564 }
2565
2566
2567 /*
2568   destroy a ctdb_client_ip structure
2569  */
2570 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2571 {
2572         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2573                 ctdb_addr_to_str(&ip->addr),
2574                 ntohs(ip->addr.ip.sin_port),
2575                 ip->client_id));
2576
2577         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2578         return 0;
2579 }
2580
2581 /*
2582   called by a client to inform us of a TCP connection that it is managing
2583   that should tickled with an ACK when IP takeover is done
2584   we handle both the old ipv4 style of packets as well as the new ipv4/6
2585   pdus.
2586  */
2587 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2588                                 TDB_DATA indata)
2589 {
2590         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2591         struct ctdb_control_tcp *old_addr = NULL;
2592         struct ctdb_control_tcp_addr new_addr;
2593         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2594         struct ctdb_tcp_list *tcp;
2595         struct ctdb_tcp_connection t;
2596         int ret;
2597         TDB_DATA data;
2598         struct ctdb_client_ip *ip;
2599         struct ctdb_vnn *vnn;
2600         ctdb_sock_addr addr;
2601
2602         switch (indata.dsize) {
2603         case sizeof(struct ctdb_control_tcp):
2604                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2605                 ZERO_STRUCT(new_addr);
2606                 tcp_sock = &new_addr;
2607                 tcp_sock->src.ip  = old_addr->src;
2608                 tcp_sock->dest.ip = old_addr->dest;
2609                 break;
2610         case sizeof(struct ctdb_control_tcp_addr):
2611                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2612                 break;
2613         default:
2614                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2615                                  "to ctdb_control_tcp_client. size was %d but "
2616                                  "only allowed sizes are %lu and %lu\n",
2617                                  (int)indata.dsize,
2618                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2619                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2620                 return -1;
2621         }
2622
2623         addr = tcp_sock->src;
2624         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2625         addr = tcp_sock->dest;
2626         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2627
2628         ZERO_STRUCT(addr);
2629         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2630         vnn = find_public_ip_vnn(ctdb, &addr);
2631         if (vnn == NULL) {
2632                 switch (addr.sa.sa_family) {
2633                 case AF_INET:
2634                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2635                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2636                                         ctdb_addr_to_str(&addr)));
2637                         }
2638                         break;
2639                 case AF_INET6:
2640                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2641                                 ctdb_addr_to_str(&addr)));
2642                         break;
2643                 default:
2644                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2645                 }
2646
2647                 return 0;
2648         }
2649
2650         if (vnn->pnn != ctdb->pnn) {
2651                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2652                         ctdb_addr_to_str(&addr),
2653                         client_id, client->pid));
2654                 /* failing this call will tell smbd to die */
2655                 return -1;
2656         }
2657
2658         ip = talloc(client, struct ctdb_client_ip);
2659         CTDB_NO_MEMORY(ctdb, ip);
2660
2661         ip->ctdb      = ctdb;
2662         ip->addr      = addr;
2663         ip->client_id = client_id;
2664         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2665         DLIST_ADD(ctdb->client_ip_list, ip);
2666
2667         tcp = talloc(client, struct ctdb_tcp_list);
2668         CTDB_NO_MEMORY(ctdb, tcp);
2669
2670         tcp->connection.src_addr = tcp_sock->src;
2671         tcp->connection.dst_addr = tcp_sock->dest;
2672
2673         DLIST_ADD(client->tcp_list, tcp);
2674
2675         t.src_addr = tcp_sock->src;
2676         t.dst_addr = tcp_sock->dest;
2677
2678         data.dptr = (uint8_t *)&t;
2679         data.dsize = sizeof(t);
2680
2681         switch (addr.sa.sa_family) {
2682         case AF_INET:
2683                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2684                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2685                         ctdb_addr_to_str(&tcp_sock->src),
2686                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2687                 break;
2688         case AF_INET6:
2689                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2690                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2691                         ctdb_addr_to_str(&tcp_sock->src),
2692                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2693                 break;
2694         default:
2695                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2696         }
2697
2698
2699         /* tell all nodes about this tcp connection */
2700         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2701                                        CTDB_CONTROL_TCP_ADD,
2702                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2703         if (ret != 0) {
2704                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2705                 return -1;
2706         }
2707
2708         return 0;
2709 }
2710
2711 /*
2712   find a tcp address on a list
2713  */
2714 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2715                                            struct ctdb_tcp_connection *tcp)
2716 {
2717         int i;
2718
2719         if (array == NULL) {
2720                 return NULL;
2721         }
2722
2723         for (i=0;i<array->num;i++) {
2724                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2725                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2726                         return &array->connections[i];
2727                 }
2728         }
2729         return NULL;
2730 }
2731
2732
2733
2734 /*
2735   called by a daemon to inform us of a TCP connection that one of its
2736   clients managing that should tickled with an ACK when IP takeover is
2737   done
2738  */
2739 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2740 {
2741         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2742         struct ctdb_tcp_array *tcparray;
2743         struct ctdb_tcp_connection tcp;
2744         struct ctdb_vnn *vnn;
2745
2746         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2747         if (vnn == NULL) {
2748                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2749                         ctdb_addr_to_str(&p->dst_addr)));
2750
2751                 return -1;
2752         }
2753
2754
2755         tcparray = vnn->tcp_array;
2756
2757         /* If this is the first tickle */
2758         if (tcparray == NULL) {
2759                 tcparray = talloc_size(ctdb->nodes, 
2760                         offsetof(struct ctdb_tcp_array, connections) +
2761                         sizeof(struct ctdb_tcp_connection) * 1);
2762                 CTDB_NO_MEMORY(ctdb, tcparray);
2763                 vnn->tcp_array = tcparray;
2764
2765                 tcparray->num = 0;
2766                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2767                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2768
2769                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2770                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2771                 tcparray->num++;
2772
2773                 if (tcp_update_needed) {
2774                         vnn->tcp_update_needed = true;
2775                 }
2776                 return 0;
2777         }
2778
2779
2780         /* Do we already have this tickle ?*/
2781         tcp.src_addr = p->src_addr;
2782         tcp.dst_addr = p->dst_addr;
2783         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2784                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2785                         ctdb_addr_to_str(&tcp.dst_addr),
2786                         ntohs(tcp.dst_addr.ip.sin_port),
2787                         vnn->pnn));
2788                 return 0;
2789         }
2790
2791         /* A new tickle, we must add it to the array */
2792         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2793                                         struct ctdb_tcp_connection,
2794                                         tcparray->num+1);
2795         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2796
2797         vnn->tcp_array = tcparray;
2798         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2799         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2800         tcparray->num++;
2801                                 
2802         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2803                 ctdb_addr_to_str(&tcp.dst_addr),
2804                 ntohs(tcp.dst_addr.ip.sin_port),
2805                 vnn->pnn));
2806
2807         if (tcp_update_needed) {
2808                 vnn->tcp_update_needed = true;
2809         }
2810
2811         return 0;
2812 }
2813
2814
2815 /*
2816   called by a daemon to inform us of a TCP connection that one of its
2817   clients managing that should tickled with an ACK when IP takeover is
2818   done
2819  */
2820 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2821 {
2822         struct ctdb_tcp_connection *tcpp;
2823         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2824
2825         if (vnn == NULL) {
2826                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2827                         ctdb_addr_to_str(&conn->dst_addr)));
2828                 return;
2829         }
2830
2831         /* if the array is empty we cant remove it
2832            and we dont need to do anything
2833          */
2834         if (vnn->tcp_array == NULL) {
2835                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2836                         ctdb_addr_to_str(&conn->dst_addr),
2837                         ntohs(conn->dst_addr.ip.sin_port)));
2838                 return;
2839         }
2840
2841
2842         /* See if we know this connection
2843            if we dont know this connection  then we dont need to do anything
2844          */
2845         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2846         if (tcpp == NULL) {
2847                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2848                         ctdb_addr_to_str(&conn->dst_addr),
2849                         ntohs(conn->dst_addr.ip.sin_port)));
2850                 return;
2851         }
2852
2853
2854         /* We need to remove this entry from the array.
2855            Instead of allocating a new array and copying data to it
2856            we cheat and just copy the last entry in the existing array
2857            to the entry that is to be removed and just shring the 
2858            ->num field
2859          */
2860         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2861         vnn->tcp_array->num--;
2862
2863         /* If we deleted the last entry we also need to remove the entire array
2864          */
2865         if (vnn->tcp_array->num == 0) {
2866                 talloc_free(vnn->tcp_array);
2867                 vnn->tcp_array = NULL;
2868         }               
2869
2870         vnn->tcp_update_needed = true;
2871
2872         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2873                 ctdb_addr_to_str(&conn->src_addr),
2874                 ntohs(conn->src_addr.ip.sin_port)));
2875 }
2876
2877
2878 /*
2879   called by a daemon to inform us of a TCP connection that one of its
2880   clients used are no longer needed in the tickle database
2881  */
2882 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2883 {
2884         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2885
2886         ctdb_remove_tcp_connection(ctdb, conn);
2887
2888         return 0;
2889 }
2890
2891
/*
  called when a daemon restarts.  The intention is to send all tickles
  for all public addresses we are serving to the new node right away;
  that is not implemented yet, so this is a no-op that reports
  success (0).
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */

        return 0;
}
2901
2902
2903 /*
2904   called when a client structure goes away - hook to remove
2905   elements from the tcp_list in all daemons
2906  */
2907 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2908 {
2909         while (client->tcp_list) {
2910                 struct ctdb_tcp_list *tcp = client->tcp_list;
2911                 DLIST_REMOVE(client->tcp_list, tcp);
2912                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2913         }
2914 }
2915
2916
2917 /*
2918   release all IPs on shutdown
2919  */
2920 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2921 {
2922         struct ctdb_vnn *vnn;
2923
2924         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2925                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2926                         ctdb_vnn_unassign_iface(ctdb, vnn);
2927                         continue;
2928                 }
2929                 if (!vnn->iface) {
2930                         continue;
2931                 }
2932                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2933                                   ctdb_vnn_iface_string(vnn),
2934                                   ctdb_addr_to_str(&vnn->public_address),
2935                                   vnn->public_netmask_bits);
2936                 release_kill_clients(ctdb, &vnn->public_address);
2937                 ctdb_vnn_unassign_iface(ctdb, vnn);
2938         }
2939 }
2940
2941
2942 /*
2943   get list of public IPs
2944  */
2945 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2946                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2947 {
2948         int i, num, len;
2949         struct ctdb_all_public_ips *ips;
2950         struct ctdb_vnn *vnn;
2951         bool only_available = false;
2952
2953         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2954                 only_available = true;
2955         }
2956
2957         /* count how many public ip structures we have */
2958         num = 0;
2959         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2960                 num++;
2961         }
2962
2963         len = offsetof(struct ctdb_all_public_ips, ips) + 
2964                 num*sizeof(struct ctdb_public_ip);
2965         ips = talloc_zero_size(outdata, len);
2966         CTDB_NO_MEMORY(ctdb, ips);
2967
2968         i = 0;
2969         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2970                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2971                         continue;
2972                 }
2973                 ips->ips[i].pnn  = vnn->pnn;
2974                 ips->ips[i].addr = vnn->public_address;
2975                 i++;
2976         }
2977         ips->num = i;
2978         len = offsetof(struct ctdb_all_public_ips, ips) +
2979                 i*sizeof(struct ctdb_public_ip);
2980
2981         outdata->dsize = len;
2982         outdata->dptr  = (uint8_t *)ips;
2983
2984         return 0;
2985 }
2986
2987
2988 /*
2989   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2990  */
2991 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2992                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2993 {
2994         int i, num, len;
2995         struct ctdb_all_public_ipsv4 *ips;
2996         struct ctdb_vnn *vnn;
2997
2998         /* count how many public ip structures we have */
2999         num = 0;
3000         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3001                 if (vnn->public_address.sa.sa_family != AF_INET) {
3002                         continue;
3003                 }
3004                 num++;
3005         }
3006
3007         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
3008                 num*sizeof(struct ctdb_public_ipv4);
3009         ips = talloc_zero_size(outdata, len);
3010         CTDB_NO_MEMORY(ctdb, ips);
3011
3012         outdata->dsize = len;
3013         outdata->dptr  = (uint8_t *)ips;
3014
3015         ips->num = num;
3016         i = 0;
3017         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3018                 if (vnn->public_address.sa.sa_family != AF_INET) {
3019                         continue;
3020                 }
3021                 ips->ips[i].pnn = vnn->pnn;
3022                 ips->ips[i].sin = vnn->public_address.ip;
3023                 i++;
3024         }
3025
3026         return 0;
3027 }
3028
3029 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3030                                         struct ctdb_req_control *c,
3031                                         TDB_DATA indata,
3032                                         TDB_DATA *outdata)
3033 {
3034         int i, num, len;
3035         ctdb_sock_addr *addr;
3036         struct ctdb_control_public_ip_info *info;
3037         struct ctdb_vnn *vnn;
3038
3039         addr = (ctdb_sock_addr *)indata.dptr;
3040
3041         vnn = find_public_ip_vnn(ctdb, addr);
3042         if (vnn == NULL) {
3043                 /* if it is not a public ip   it could be our 'single ip' */
3044                 if (ctdb->single_ip_vnn) {
3045                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3046                                 vnn = ctdb->single_ip_vnn;
3047                         }
3048                 }
3049         }
3050         if (vnn == NULL) {
3051                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3052                                  "'%s'not a public address\n",
3053                                  ctdb_addr_to_str(addr)));
3054                 return -1;
3055         }
3056
3057         /* count how many public ip structures we have */
3058         num = 0;
3059         for (;vnn->ifaces[num];) {
3060                 num++;
3061         }
3062
3063         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3064                 num*sizeof(struct ctdb_control_iface_info);
3065         info = talloc_zero_size(outdata, len);
3066         CTDB_NO_MEMORY(ctdb, info);
3067
3068         info->ip.addr = vnn->public_address;
3069         info->ip.pnn = vnn->pnn;
3070         info->active_idx = 0xFFFFFFFF;
3071
3072         for (i=0; vnn->ifaces[i]; i++) {
3073                 struct ctdb_iface *cur;
3074
3075                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3076                 if (cur == NULL) {
3077                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3078                                            vnn->ifaces[i]));
3079                         return -1;
3080                 }
3081                 if (vnn->iface == cur) {
3082                         info->active_idx = i;
3083                 }
3084                 strcpy(info->ifaces[i].name, cur->name);
3085                 info->ifaces[i].link_state = cur->link_up;
3086                 info->ifaces[i].references = cur->references;
3087         }
3088         info->num = i;
3089         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3090                 i*sizeof(struct ctdb_control_iface_info);
3091
3092         outdata->dsize = len;
3093         outdata->dptr  = (uint8_t *)info;
3094
3095         return 0;
3096 }
3097
3098 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3099                                 struct ctdb_req_control *c,
3100                                 TDB_DATA *outdata)
3101 {
3102         int i, num, len;
3103         struct ctdb_control_get_ifaces *ifaces;
3104         struct ctdb_iface *cur;
3105
3106         /* count how many public ip structures we have */
3107         num = 0;
3108         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3109                 num++;
3110         }
3111
3112         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3113                 num*sizeof(struct ctdb_control_iface_info);
3114         ifaces = talloc_zero_size(outdata, len);
3115         CTDB_NO_MEMORY(ctdb, ifaces);
3116
3117         i = 0;
3118         for (cur=ctdb->ifaces;cur;cur=cur->next) {
3119                 strcpy(ifaces->ifaces[i].name, cur->name);
3120                 ifaces->ifaces[i].link_state = cur->link_up;
3121                 ifaces->ifaces[i].references = cur->references;
3122                 i++;
3123         }
3124         ifaces->num = i;
3125         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3126                 i*sizeof(struct ctdb_control_iface_info);
3127
3128         outdata->dsize = len;
3129         outdata->dptr  = (uint8_t *)ifaces;
3130
3131         return 0;
3132 }
3133
3134 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3135                                     struct ctdb_req_control *c,
3136                                     TDB_DATA indata)
3137 {
3138         struct ctdb_control_iface_info *info;
3139         struct ctdb_iface *iface;
3140         bool link_up = false;
3141
3142         info = (struct ctdb_control_iface_info *)indata.dptr;
3143
3144         if (info->name[CTDB_IFACE_SIZE] != '\0') {
3145                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3146                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3147                                   len, len, info->name));
3148                 return -1;
3149         }
3150
3151         switch (info->link_state) {
3152         case 0:
3153                 link_up = false;
3154                 break;
3155         case 1:
3156                 link_up = true;
3157                 break;
3158         default:
3159                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3160                                   (unsigned int)info->link_state));
3161                 return -1;
3162         }
3163
3164         if (info->references != 0) {
3165                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3166                                   (unsigned int)info->references));
3167                 return -1;
3168         }
3169
3170         iface = ctdb_find_iface(ctdb, info->name);
3171         if (iface == NULL) {
3172                 return -1;
3173         }
3174
3175         if (link_up == iface->link_up) {
3176                 return 0;
3177         }
3178
3179         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3180               ("iface[%s] has changed it's link status %s => %s\n",
3181                iface->name,
3182                iface->link_up?"up":"down",
3183                link_up?"up":"down"));
3184
3185         iface->link_up = link_up;
3186         return 0;
3187 }
3188
3189
3190 /* 
3191    structure containing the listening socket and the list of tcp connections
3192    that the ctdb daemon is to kill
3193 */
3194 struct ctdb_kill_tcp {
3195         struct ctdb_vnn *vnn;
3196         struct ctdb_context *ctdb;
3197         int capture_fd;
3198         struct fd_event *fde;
3199         trbt_tree_t *connections;
3200         void *private_data;
3201 };
3202
3203 /*
3204   a tcp connection that is to be killed
3205  */
3206 struct ctdb_killtcp_con {
3207         ctdb_sock_addr src_addr;
3208         ctdb_sock_addr dst_addr;
3209         int count;
3210         struct ctdb_kill_tcp *killtcp;
3211 };
3212
3213 /* this function is used to create a key to represent this socketpair
3214    in the killtcp tree.
3215    this key is used to insert and lookup matching socketpairs that are
3216    to be tickled and RST
3217 */
3218 #define KILLTCP_KEYLEN  10
3219 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3220 {
3221         static uint32_t key[KILLTCP_KEYLEN];
3222
3223         bzero(key, sizeof(key));
3224
3225         if (src->sa.sa_family != dst->sa.sa_family) {
3226                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3227                 return key;
3228         }
3229         
3230         switch (src->sa.sa_family) {
3231         case AF_INET:
3232                 key[0]  = dst->ip.sin_addr.s_addr;
3233                 key[1]  = src->ip.sin_addr.s_addr;
3234                 key[2]  = dst->ip.sin_port;
3235                 key[3]  = src->ip.sin_port;
3236                 break;
3237         case AF_INET6: {
3238                 uint32_t *dst6_addr32 =
3239                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3240                 uint32_t *src6_addr32 =
3241                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3242                 key[0]  = dst6_addr32[3];
3243                 key[1]  = src6_addr32[3];
3244                 key[2]  = dst6_addr32[2];
3245                 key[3]  = src6_addr32[2];
3246                 key[4]  = dst6_addr32[1];
3247                 key[5]  = src6_addr32[1];
3248                 key[6]  = dst6_addr32[0];
3249                 key[7]  = src6_addr32[0];
3250                 key[8]  = dst->ip6.sin6_port;
3251                 key[9]  = src->ip6.sin6_port;
3252                 break;
3253         }
3254         default:
3255                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3256                 return key;
3257         }
3258
3259         return key;
3260 }
3261
3262 /*
3263   called when we get a read event on the raw socket
3264  */
3265 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
3266                                 uint16_t flags, void *private_data)
3267 {
3268         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3269         struct ctdb_killtcp_con *con;
3270         ctdb_sock_addr src, dst;
3271         uint32_t ack_seq, seq;
3272
3273         if (!(flags & EVENT_FD_READ)) {
3274                 return;
3275         }
3276
3277         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3278                                 killtcp->private_data,
3279                                 &src, &dst,
3280                                 &ack_seq, &seq) != 0) {
3281                 /* probably a non-tcp ACK packet */
3282                 return;
3283         }
3284
3285         /* check if we have this guy in our list of connections
3286            to kill
3287         */
3288         con = trbt_lookuparray32(killtcp->connections, 
3289                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3290         if (con == NULL) {
3291                 /* no this was some other packet we can just ignore */
3292                 return;
3293         }
3294
3295         /* This one has been tickled !
3296            now reset him and remove him from the list.
3297          */
3298         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3299                 ntohs(con->dst_addr.ip.sin_port),
3300                 ctdb_addr_to_str(&con->src_addr),
3301                 ntohs(con->src_addr.ip.sin_port)));
3302
3303         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3304         talloc_free(con);
3305 }
3306
3307
3308 /* when traversing the list of all tcp connections to send tickle acks to
3309    (so that we can capture the ack coming back and kill the connection
3310     by a RST)
3311    this callback is called for each connection we are currently trying to kill
3312 */
3313 static int tickle_connection_traverse(void *param, void *data)
3314 {
3315         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3316
3317         /* have tried too many times, just give up */
3318         if (con->count >= 5) {
3319                 /* can't delete in traverse: reparent to delete_cons */
3320                 talloc_steal(param, con);
3321                 return 0;
3322         }
3323
3324         /* othervise, try tickling it again */
3325         con->count++;
3326         ctdb_sys_send_tcp(
3327                 (ctdb_sock_addr *)&con->dst_addr,
3328                 (ctdb_sock_addr *)&con->src_addr,
3329                 0, 0, 0);
3330         return 0;
3331 }
3332
3333
3334 /* 
3335    called every second until all sentenced connections have been reset
3336  */
3337 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
3338                                               struct timeval t, void *private_data)
3339 {
3340         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3341         void *delete_cons = talloc_new(NULL);
3342
3343         /* loop over all connections sending tickle ACKs */
3344         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3345
3346         /* now we've finished traverse, it's safe to do deletion. */
3347         talloc_free(delete_cons);
3348
3349         /* If there are no more connections to kill we can remove the
3350            entire killtcp structure
3351          */
3352         if ( (killtcp->connections == NULL) || 
3353              (killtcp->connections->root == NULL) ) {
3354                 talloc_free(killtcp);
3355                 return;
3356         }
3357
3358         /* try tickling them again in a seconds time
3359          */
3360         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3361                         ctdb_tickle_sentenced_connections, killtcp);
3362 }
3363
3364 /*
3365   destroy the killtcp structure
3366  */
3367 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3368 {
3369         struct ctdb_vnn *tmpvnn;
3370
3371         /* verify that this vnn is still active */
3372         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3373                 if (tmpvnn == killtcp->vnn) {
3374                         break;
3375                 }
3376         }
3377
3378         if (tmpvnn == NULL) {
3379                 return 0;
3380         }
3381
3382         if (killtcp->vnn->killtcp != killtcp) {
3383                 return 0;
3384         }
3385
3386         killtcp->vnn->killtcp = NULL;
3387
3388         return 0;
3389 }
3390
3391
/* Insert callback for the killtcp tree.  Nothing fancy: always hand
   back the new connection structure (parm) so that it replaces
   whatever was stored under the same key (data).

   The previous entry, if any, is deliberately not freed here; per the
   original author it hangs off the same tree node and goes away when
   the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;

        return parm;
}
3403
3404 /*
3405   add a tcp socket to the list of connections we want to RST
3406  */
3407 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
3408                                        ctdb_sock_addr *s,
3409                                        ctdb_sock_addr *d)
3410 {
3411         ctdb_sock_addr src, dst;
3412         struct ctdb_kill_tcp *killtcp;
3413         struct ctdb_killtcp_con *con;
3414         struct ctdb_vnn *vnn;
3415
3416         ctdb_canonicalize_ip(s, &src);
3417         ctdb_canonicalize_ip(d, &dst);
3418
3419         vnn = find_public_ip_vnn(ctdb, &dst);
3420         if (vnn == NULL) {
3421                 vnn = find_public_ip_vnn(ctdb, &src);
3422         }
3423         if (vnn == NULL) {
3424                 /* if it is not a public ip   it could be our 'single ip' */
3425                 if (ctdb->single_ip_vnn) {
3426                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3427                                 vnn = ctdb->single_ip_vnn;
3428                         }
3429                 }
3430         }
3431         if (vnn == NULL) {
3432                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
3433                 return -1;
3434         }
3435
3436         killtcp = vnn->killtcp;
3437         
3438         /* If this is the first connection to kill we must allocate
3439            a new structure
3440          */
3441         if (killtcp == NULL) {
3442                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3443                 CTDB_NO_MEMORY(ctdb, killtcp);
3444
3445                 killtcp->vnn         = vnn;
3446                 killtcp->ctdb        = ctdb;
3447                 killtcp->capture_fd  = -1;
3448                 killtcp->connections = trbt_create(killtcp, 0);
3449
3450                 vnn->killtcp         = killtcp;
3451                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3452         }
3453
3454
3455
3456         /* create a structure that describes this connection we want to
3457            RST and store it in killtcp->connections
3458         */
3459         con = talloc(killtcp, struct ctdb_killtcp_con);
3460         CTDB_NO_MEMORY(ctdb, con);
3461         con->src_addr = src;
3462         con->dst_addr = dst;
3463         con->count    = 0;
3464         con->killtcp  = killtcp;
3465
3466
3467         trbt_insertarray32_callback(killtcp->connections,
3468                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3469                         add_killtcp_callback, con);
3470
3471         /* 
3472            If we dont have a socket to listen on yet we must create it
3473          */
3474         if (killtcp->capture_fd == -1) {
3475                 const char *iface = ctdb_vnn_iface_string(vnn);
3476                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3477                 if (killtcp->capture_fd == -1) {
3478                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3479                                           "socket on iface '%s' for killtcp (%s)\n",
3480                                           iface, strerror(errno)));
3481                         goto failed;
3482                 }
3483         }
3484
3485
3486         if (killtcp->fde == NULL) {
3487                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
3488                                             EVENT_FD_READ,
3489                                             capture_tcp_handler, killtcp);
3490                 tevent_fd_set_auto_close(killtcp->fde);
3491
3492                 /* We also need to set up some events to tickle all these connections
3493                    until they are all reset
3494                 */
3495                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
3496                                 ctdb_tickle_sentenced_connections, killtcp);
3497         }
3498
3499         /* tickle him once now */
3500         ctdb_sys_send_tcp(
3501                 &con->dst_addr,
3502                 &con->src_addr,
3503                 0, 0, 0);
3504
3505         return 0;
3506
3507 failed:
3508         talloc_free(vnn->killtcp);
3509         vnn->killtcp = NULL;
3510         return -1;
3511 }
3512
3513 /*
3514   kill a TCP connection.
3515  */
3516 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3517 {
3518         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3519
3520         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3521 }
3522
3523 /*
3524   called by a daemon to inform us of the entire list of TCP tickles for
3525   a particular public address.
3526   this control should only be sent by the node that is currently serving
3527   that public address.
3528  */
3529 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3530 {
3531         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3532         struct ctdb_tcp_array *tcparray;
3533         struct ctdb_vnn *vnn;
3534
3535         /* We must at least have tickles.num or else we cant verify the size
3536            of the received data blob
3537          */
3538         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3539                                         tickles.connections)) {
3540                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3541                 return -1;
3542         }
3543
3544         /* verify that the size of data matches what we expect */
3545         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3546                                 tickles.connections)
3547                          + sizeof(struct ctdb_tcp_connection)
3548                                  * list->tickles.num) {
3549                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3550                 return -1;
3551         }       
3552
3553         vnn = find_public_ip_vnn(ctdb, &list->addr);
3554         if (vnn == NULL) {
3555                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3556                         ctdb_addr_to_str(&list->addr)));
3557
3558                 return 1;
3559         }
3560
3561         /* remove any old ticklelist we might have */
3562         talloc_free(vnn->tcp_array);
3563         vnn->tcp_array = NULL;
3564
3565         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3566         CTDB_NO_MEMORY(ctdb, tcparray);
3567
3568         tcparray->num = list->tickles.num;
3569
3570         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3571         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3572
3573         memcpy(tcparray->connections, &list->tickles.connections[0], 
3574                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3575
3576         /* We now have a new fresh tickle list array for this vnn */
3577         vnn->tcp_array = talloc_steal(vnn, tcparray);
3578         
3579         return 0;
3580 }
3581
3582 /*
3583   called to return the full list of tickles for the puclic address associated 
3584   with the provided vnn
3585  */
3586 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3587 {
3588         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3589         struct ctdb_control_tcp_tickle_list *list;
3590         struct ctdb_tcp_array *tcparray;
3591         int num;
3592         struct ctdb_vnn *vnn;
3593
3594         vnn = find_public_ip_vnn(ctdb, addr);
3595         if (vnn == NULL) {
3596                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3597                         ctdb_addr_to_str(addr)));
3598
3599                 return 1;
3600         }
3601
3602         tcparray = vnn->tcp_array;
3603         if (tcparray) {
3604                 num = tcparray->num;
3605         } else {
3606                 num = 0;
3607         }
3608
3609         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3610                                 tickles.connections)
3611                         + sizeof(struct ctdb_tcp_connection) * num;
3612
3613         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3614         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3615         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3616
3617         list->addr = *addr;
3618         list->tickles.num = num;
3619         if (num) {
3620                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3621                         sizeof(struct ctdb_tcp_connection) * num);
3622         }
3623
3624         return 0;
3625 }
3626
3627
3628 /*
3629   set the list of all tcp tickles for a public address
3630  */
3631 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3632                               struct timeval timeout, uint32_t destnode, 
3633                               ctdb_sock_addr *addr,
3634                               struct ctdb_tcp_array *tcparray)
3635 {
3636         int ret, num;
3637         TDB_DATA data;
3638         struct ctdb_control_tcp_tickle_list *list;
3639
3640         if (tcparray) {
3641                 num = tcparray->num;
3642         } else {
3643                 num = 0;
3644         }
3645
3646         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3647                                 tickles.connections) +
3648                         sizeof(struct ctdb_tcp_connection) * num;
3649         data.dptr = talloc_size(ctdb, data.dsize);
3650         CTDB_NO_MEMORY(ctdb, data.dptr);
3651
3652         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3653         list->addr = *addr;
3654         list->tickles.num = num;
3655         if (tcparray) {
3656                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3657         }
3658
3659         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3660                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3661                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3662         if (ret != 0) {
3663                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3664                 return -1;
3665         }
3666
3667         talloc_free(data.dptr);
3668
3669         return ret;
3670 }
3671
3672
3673 /*
3674   perform tickle updates if required
3675  */
3676 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3677                                 struct timed_event *te, 
3678                                 struct timeval t, void *private_data)
3679 {
3680         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3681         int ret;
3682         struct ctdb_vnn *vnn;
3683
3684         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3685                 /* we only send out updates for public addresses that 
3686                    we have taken over
3687                  */
3688                 if (ctdb->pnn != vnn->pnn) {
3689                         continue;
3690                 }
3691                 /* We only send out the updates if we need to */
3692                 if (!vnn->tcp_update_needed) {
3693                         continue;
3694                 }
3695                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3696                                 TAKEOVER_TIMEOUT(),
3697                                 CTDB_BROADCAST_CONNECTED,
3698                                 &vnn->public_address,
3699                                 vnn->tcp_array);
3700                 if (ret != 0) {
3701                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3702                                 ctdb_addr_to_str(&vnn->public_address)));
3703                 }
3704         }
3705
3706         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3707                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3708                              ctdb_update_tcp_tickles, ctdb);
3709 }               
3710         
3711
3712 /*
3713   start periodic update of tcp tickles
3714  */
3715 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3716 {
3717         ctdb->tickle_update_context = talloc_new(ctdb);
3718
3719         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3720                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3721                              ctdb_update_tcp_tickles, ctdb);
3722 }
3723
3724
3725
3726
3727 struct control_gratious_arp {
3728         struct ctdb_context *ctdb;
3729         ctdb_sock_addr addr;
3730         const char *iface;
3731         int count;
3732 };
3733
3734 /*
3735   send a control_gratuitous arp
3736  */
3737 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3738                                   struct timeval t, void *private_data)
3739 {
3740         int ret;
3741         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3742                                                         struct control_gratious_arp);
3743
3744         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3745         if (ret != 0) {
3746                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3747                                  arp->iface, strerror(errno)));
3748         }
3749
3750
3751         arp->count++;
3752         if (arp->count == CTDB_ARP_REPEAT) {
3753                 talloc_free(arp);
3754                 return;
3755         }
3756
3757         event_add_timed(arp->ctdb->ev, arp, 
3758                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3759                         send_gratious_arp, arp);
3760 }
3761
3762
3763 /*
3764   send a gratious arp 
3765  */
3766 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3767 {
3768         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3769         struct control_gratious_arp *arp;
3770
3771         /* verify the size of indata */
3772         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3773                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3774                                  (unsigned)indata.dsize, 
3775                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3776                 return -1;
3777         }
3778         if (indata.dsize != 
3779                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3780                 + gratious_arp->len ) ){
3781
3782                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3783                         "but should be %u bytes\n", 
3784                          (unsigned)indata.dsize, 
3785                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3786                 return -1;
3787         }
3788
3789
3790         arp = talloc(ctdb, struct control_gratious_arp);
3791         CTDB_NO_MEMORY(ctdb, arp);
3792
3793         arp->ctdb  = ctdb;
3794         arp->addr   = gratious_arp->addr;
3795         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3796         CTDB_NO_MEMORY(ctdb, arp->iface);
3797         arp->count = 0;
3798         
3799         event_add_timed(arp->ctdb->ev, arp, 
3800                         timeval_zero(), send_gratious_arp, arp);
3801
3802         return 0;
3803 }
3804
3805 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3806 {
3807         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3808         int ret;
3809
3810         /* verify the size of indata */
3811         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3812                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3813                 return -1;
3814         }
3815         if (indata.dsize != 
3816                 ( offsetof(struct ctdb_control_ip_iface, iface)
3817                 + pub->len ) ){
3818
3819                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3820                         "but should be %u bytes\n", 
3821                          (unsigned)indata.dsize, 
3822                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3823                 return -1;
3824         }
3825
3826         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3827
3828         if (ret != 0) {
3829                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3830                 return -1;
3831         }
3832
3833         return 0;
3834 }
3835
/*
  completion callback for the releaseip event started by
  del_public_address: private_data is the talloc context to dispose
  of.  The event status is ignored.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
                                void *private_data)
{
        void *mem_ctx = private_data;

        talloc_free(mem_ctx);
}
3844
3845 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3846 {
3847         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3848         struct ctdb_vnn *vnn;
3849         int ret;
3850
3851         /* verify the size of indata */
3852         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3853                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3854                 return -1;
3855         }
3856         if (indata.dsize != 
3857                 ( offsetof(struct ctdb_control_ip_iface, iface)
3858                 + pub->len ) ){
3859
3860                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3861                         "but should be %u bytes\n", 
3862                          (unsigned)indata.dsize, 
3863                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3864                 return -1;
3865         }
3866
3867         /* walk over all public addresses until we find a match */
3868         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3869                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3870                         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3871
3872                         DLIST_REMOVE(ctdb->vnn, vnn);
3873                         talloc_steal(mem_ctx, vnn);
3874                         ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
3875                         if (vnn->pnn != ctdb->pnn) {
3876                                 if (vnn->iface != NULL) {
3877                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3878                                 }
3879                                 talloc_free(mem_ctx);
3880                                 return 0;
3881                         }
3882                         vnn->pnn = -1;
3883
3884                         ret = ctdb_event_script_callback(ctdb, 
3885                                          mem_ctx, delete_ip_callback, mem_ctx,
3886                                          false,
3887                                          CTDB_EVENT_RELEASE_IP,
3888                                          "%s %s %u",
3889                                          ctdb_vnn_iface_string(vnn),
3890                                          ctdb_addr_to_str(&vnn->public_address),
3891                                          vnn->public_netmask_bits);
3892                         if (vnn->iface != NULL) {
3893                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3894                         }
3895                         if (ret != 0) {
3896                                 return -1;
3897                         }
3898                         return 0;
3899                 }
3900         }
3901
3902         return -1;
3903 }
3904
3905
/* state carried from the ipreallocated control to its event callback */
struct ipreallocated_callback_state {
        /* the control request that still has to be replied to */
        struct ctdb_req_control *c;
};
3909
3910 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3911                                         int status, void *p)
3912 {
3913         struct ipreallocated_callback_state *state =
3914                 talloc_get_type(p, struct ipreallocated_callback_state);
3915
3916         if (status != 0) {
3917                 DEBUG(DEBUG_ERR,
3918                       (" \"ipreallocated\" event script failed (status %d)\n",
3919                        status));
3920                 if (status == -ETIME) {
3921                         ctdb_ban_self(ctdb);
3922                 }
3923         }
3924
3925         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3926         talloc_free(state);
3927 }
3928
3929 /* A control to run the ipreallocated event */
3930 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3931                                    struct ctdb_req_control *c,
3932                                    bool *async_reply)
3933 {
3934         int ret;
3935         struct ipreallocated_callback_state *state;
3936
3937         state = talloc(ctdb, struct ipreallocated_callback_state);
3938         CTDB_NO_MEMORY(ctdb, state);
3939
3940         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3941
3942         ret = ctdb_event_script_callback(ctdb, state,
3943                                          ctdb_ipreallocated_callback, state,
3944                                          false, CTDB_EVENT_IPREALLOCATED,
3945                                          "%s", "");
3946
3947         if (ret != 0) {
3948                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3949                 talloc_free(state);
3950                 return -1;
3951         }
3952
3953         /* tell the control that we will be reply asynchronously */
3954         state->c    = talloc_steal(state, c);
3955         *async_reply = true;
3956
3957         return 0;
3958 }
3959
3960
3961 /* This function is called from the recovery daemon to verify that a remote
3962    node has the expected ip allocation.
3963    This is verified against ctdb->ip_tree
3964 */
3965 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3966 {
3967         struct ctdb_public_ip_list *tmp_ip; 
3968         int i;
3969
3970         if (ctdb->ip_tree == NULL) {
3971                 /* dont know the expected allocation yet, assume remote node
3972                    is correct. */
3973                 return 0;
3974         }
3975
3976         if (ips == NULL) {
3977                 return 0;
3978         }
3979
3980         for (i=0; i<ips->num; i++) {
3981                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3982                 if (tmp_ip == NULL) {
3983                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3984                         return -1;
3985                 }
3986
3987                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3988                         continue;
3989                 }
3990
3991                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3992                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3993                         return -1;
3994                 }
3995         }
3996
3997         return 0;
3998 }
3999
4000 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4001 {
4002         struct ctdb_public_ip_list *tmp_ip; 
4003
4004         if (ctdb->ip_tree == NULL) {
4005                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4006                 return -1;
4007         }
4008
4009         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4010         if (tmp_ip == NULL) {
4011                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4012                 return -1;
4013         }
4014
4015         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4016         tmp_ip->pnn = ip->pnn;
4017
4018         return 0;
4019 }
4020
4021
/*
  Bookkeeping for one in-flight "reload public ips" request.  Freeing
  the handle runs ctdb_reloadips_destructor().
*/
struct ctdb_reloadips_handle {
	/* daemon context this request belongs to */
	struct ctdb_context *ctdb;

	/* the control to reply to; NULL when there is nothing to answer */
	struct ctdb_req_control *c;

	/* status sent back in the reply to c */
	int status;

	/* pipe from the child (fd[1], write end) to the parent (fd[0]) */
	int fd[2];

	/* pid of the forked child doing the actual reload */
	pid_t child;

	/* read event on fd[0], fires when the child reports its result */
	struct fd_event *fde;
};
4030
4031 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4032 {
4033         if (h == h->ctdb->reload_ips) {
4034                 h->ctdb->reload_ips = NULL;
4035         }
4036         if (h->c != NULL) {
4037                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4038                 h->c = NULL;
4039         }
4040         ctdb_kill(h->ctdb, h->child, SIGKILL);
4041         return 0;
4042 }
4043
4044 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4045                                 struct timed_event *te,
4046                                 struct timeval t, void *private_data)
4047 {
4048         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4049
4050         talloc_free(h);
4051 }       
4052
4053 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde, 
4054                              uint16_t flags, void *private_data)
4055 {
4056         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4057
4058         char res;
4059         int ret;
4060
4061         ret = read(h->fd[0], &res, 1);
4062         if (ret < 1 || res != 0) {
4063                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4064                 res = 1;
4065         }
4066         h->status = res;
4067
4068         talloc_free(h);
4069 }
4070
4071 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4072 {
4073         TALLOC_CTX *mem_ctx = talloc_new(NULL);
4074         struct ctdb_all_public_ips *ips;
4075         struct ctdb_vnn *vnn;
4076         int i, ret;
4077
4078         /* read the ip allocation from the local node */
4079         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4080         if (ret != 0) {
4081                 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4082                 talloc_free(mem_ctx);
4083                 return -1;
4084         }
4085
4086         /* re-read the public ips file */
4087         ctdb->vnn = NULL;
4088         if (ctdb_set_public_addresses(ctdb, false) != 0) {
4089                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4090                 talloc_free(mem_ctx);
4091                 return -1;
4092         }               
4093
4094
4095         /* check the previous list of ips and scan for ips that have been
4096            dropped.
4097          */
4098         for (i = 0; i < ips->num; i++) {
4099                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4100                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4101                                 break;
4102                         }
4103                 }
4104
4105                 /* we need to delete this ip, no longer available on this node */
4106                 if (vnn == NULL) {
4107                         struct ctdb_control_ip_iface pub;
4108
4109                         DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4110                         pub.addr  = ips->ips[i].addr;
4111                         pub.mask  = 0;
4112                         pub.len   = 0;
4113
4114                         ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4115                         if (ret != 0) {
4116                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4117                                 return -1;
4118                         }
4119                 }
4120         }
4121
4122
4123         /* loop over all new ones and check the ones we need to add */
4124         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4125                 for (i = 0; i < ips->num; i++) {
4126                         if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4127                                 break;
4128                         }
4129                 }
4130                 if (i == ips->num) {
4131                         struct ctdb_control_ip_iface pub;
4132                         const char *ifaces = NULL;
4133                         int iface = 0;
4134
4135                         DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4136
4137                         pub.addr  = vnn->public_address;
4138                         pub.mask  = vnn->public_netmask_bits;
4139
4140
4141                         ifaces = vnn->ifaces[0];
4142                         iface = 1;
4143                         while (vnn->ifaces[iface] != NULL) {
4144                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4145                                 iface++;
4146                         }
4147                         pub.len   = strlen(ifaces)+1;
4148                         memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
4149
4150                         ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4151                         if (ret != 0) {
4152                                 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4153                                 return -1;
4154                         }
4155                 }
4156         }
4157
4158         return 0;
4159 }
4160
4161 /* This control is sent to force the node to re-read the public addresses file
4162    and drop any addresses we should nnot longer host, and add new addresses
4163    that we are now able to host
4164 */
4165 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4166 {
4167         struct ctdb_reloadips_handle *h;
4168         pid_t parent = getpid();
4169
4170         if (ctdb->reload_ips != NULL) {
4171                 talloc_free(ctdb->reload_ips);
4172                 ctdb->reload_ips = NULL;
4173         }
4174
4175         h = talloc(ctdb, struct ctdb_reloadips_handle);
4176         CTDB_NO_MEMORY(ctdb, h);
4177         h->ctdb     = ctdb;
4178         h->c        = NULL;
4179         h->status   = -1;
4180         
4181         if (pipe(h->fd) == -1) {
4182                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4183                 talloc_free(h);
4184                 return -1;
4185         }
4186
4187         h->child = ctdb_fork(ctdb);
4188         if (h->child == (pid_t)-1) {
4189                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4190                 close(h->fd[0]);
4191                 close(h->fd[1]);
4192                 talloc_free(h);
4193                 return -1;
4194         }
4195
4196         /* child process */
4197         if (h->child == 0) {
4198                 signed char res = 0;
4199
4200                 close(h->fd[0]);
4201                 debug_extra = talloc_asprintf(NULL, "reloadips:");
4202
4203                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4204                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4205                         res = -1;
4206                 } else {
4207                         res = ctdb_reloadips_child(ctdb);
4208                         if (res != 0) {
4209                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4210                         }
4211                 }
4212
4213                 write(h->fd[1], &res, 1);
4214                 /* make sure we die when our parent dies */
4215                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4216                         sleep(5);
4217                 }
4218                 _exit(0);
4219         }
4220
4221         h->c             = talloc_steal(h, c);
4222
4223         close(h->fd[1]);
4224         set_close_on_exec(h->fd[0]);
4225
4226         talloc_set_destructor(h, ctdb_reloadips_destructor);
4227
4228
4229         h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4230                         EVENT_FD_READ, ctdb_reloadips_child_handler,
4231                         (void *)h);
4232         tevent_fd_set_auto_close(h->fde);
4233
4234         event_add_timed(ctdb->ev, h,
4235                         timeval_current_ofs(120, 0),
4236                         ctdb_reloadips_timeout_event, h);
4237
4238         /* we reply later */
4239         *async_reply = true;
4240         return 0;
4241 }