20311724834e71988fbfc104576bb89e9b935ce3
[ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tevent/tevent.h"
23 #include "lib/tdb/include/tdb.h"
24 #include "lib/util/dlinklist.h"
25 #include "system/network.h"
26 #include "system/filesys.h"
27 #include "system/wait.h"
28 #include "../include/ctdb_private.h"
29 #include "../common/rb_tree.h"
30
31
/*
  timeout built from the takeover_timeout tunable; expands to an
  expression that needs a variable named 'ctdb' in scope
 */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* seconds component of the delay between two gratuitous arp rounds */
#define CTDB_ARP_INTERVAL 1
/* number of arp rounds sent after an address has been taken over */
#define CTDB_ARP_REPEAT   3
36
/*
  a local network interface that public addresses can be assigned to
 */
struct ctdb_iface {
	/* linkage for the ctdb->ifaces list */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, used as the lookup key */
	const char *name;
	/* only interfaces with link_up set are picked for an address */
	bool link_up;
	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
43
44 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
45 {
46         if (vnn->iface) {
47                 return vnn->iface->name;
48         }
49
50         return "__none__";
51 }
52
53 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
54 {
55         struct ctdb_iface *i;
56
57         /* Verify that we dont have an entry for this ip yet */
58         for (i=ctdb->ifaces;i;i=i->next) {
59                 if (strcmp(i->name, iface) == 0) {
60                         return 0;
61                 }
62         }
63
64         /* create a new structure for this interface */
65         i = talloc_zero(ctdb, struct ctdb_iface);
66         CTDB_NO_MEMORY_FATAL(ctdb, i);
67         i->name = talloc_strdup(i, iface);
68         CTDB_NO_MEMORY(ctdb, i->name);
69         i->link_up = false;
70
71         DLIST_ADD(ctdb->ifaces, i);
72
73         return 0;
74 }
75
76 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
77                                           const char *iface)
78 {
79         struct ctdb_iface *i;
80
81         /* Verify that we dont have an entry for this ip yet */
82         for (i=ctdb->ifaces;i;i=i->next) {
83                 if (strcmp(i->name, iface) == 0) {
84                         return i;
85                 }
86         }
87
88         return NULL;
89 }
90
91 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
92                                               struct ctdb_vnn *vnn)
93 {
94         int i;
95         struct ctdb_iface *cur = NULL;
96         struct ctdb_iface *best = NULL;
97
98         for (i=0; vnn->ifaces[i]; i++) {
99
100                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
101                 if (cur == NULL) {
102                         continue;
103                 }
104
105                 if (!cur->link_up) {
106                         continue;
107                 }
108
109                 if (best == NULL) {
110                         best = cur;
111                         continue;
112                 }
113
114                 if (cur->references < best->references) {
115                         best = cur;
116                         continue;
117                 }
118         }
119
120         return best;
121 }
122
123 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
124                                      struct ctdb_vnn *vnn)
125 {
126         struct ctdb_iface *best = NULL;
127
128         if (vnn->iface) {
129                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
130                                    "still assigned to iface '%s'\n",
131                                    ctdb_addr_to_str(&vnn->public_address),
132                                    ctdb_vnn_iface_string(vnn)));
133                 return 0;
134         }
135
136         best = ctdb_vnn_best_iface(ctdb, vnn);
137         if (best == NULL) {
138                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
139                                   "cannot assign to iface any iface\n",
140                                   ctdb_addr_to_str(&vnn->public_address)));
141                 return -1;
142         }
143
144         vnn->iface = best;
145         best->references++;
146         vnn->pnn = ctdb->pnn;
147
148         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
149                            "now assigned to iface '%s' refs[%d]\n",
150                            ctdb_addr_to_str(&vnn->public_address),
151                            ctdb_vnn_iface_string(vnn),
152                            best->references));
153         return 0;
154 }
155
156 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
157                                     struct ctdb_vnn *vnn)
158 {
159         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
160                            "now unassigned (old iface '%s' refs[%d])\n",
161                            ctdb_addr_to_str(&vnn->public_address),
162                            ctdb_vnn_iface_string(vnn),
163                            vnn->iface?vnn->iface->references:0));
164         if (vnn->iface) {
165                 vnn->iface->references--;
166         }
167         vnn->iface = NULL;
168         if (vnn->pnn == ctdb->pnn) {
169                 vnn->pnn = -1;
170         }
171 }
172
173 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
174                                struct ctdb_vnn *vnn)
175 {
176         int i;
177
178         if (vnn->iface && vnn->iface->link_up) {
179                 return true;
180         }
181
182         for (i=0; vnn->ifaces[i]; i++) {
183                 struct ctdb_iface *cur;
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (cur->link_up) {
191                         return true;
192                 }
193         }
194
195         return false;
196 }
197
198 struct ctdb_takeover_arp {
199         struct ctdb_context *ctdb;
200         uint32_t count;
201         ctdb_sock_addr addr;
202         struct ctdb_tcp_array *tcparray;
203         struct ctdb_vnn *vnn;
204 };
205
206
207 /*
208   lists of tcp endpoints
209  */
210 struct ctdb_tcp_list {
211         struct ctdb_tcp_list *prev, *next;
212         struct ctdb_tcp_connection connection;
213 };
214
215 /*
216   list of clients to kill on IP release
217  */
218 struct ctdb_client_ip {
219         struct ctdb_client_ip *prev, *next;
220         struct ctdb_context *ctdb;
221         ctdb_sock_addr addr;
222         uint32_t client_id;
223 };
224
225
226 /*
227   send a gratuitous arp
228  */
229 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
230                                   struct timeval t, void *private_data)
231 {
232         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
233                                                         struct ctdb_takeover_arp);
234         int i, ret;
235         struct ctdb_tcp_array *tcparray;
236         const char *iface = ctdb_vnn_iface_string(arp->vnn);
237
238         ret = ctdb_sys_send_arp(&arp->addr, iface);
239         if (ret != 0) {
240                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
241                                   iface, strerror(errno)));
242         }
243
244         tcparray = arp->tcparray;
245         if (tcparray) {
246                 for (i=0;i<tcparray->num;i++) {
247                         struct ctdb_tcp_connection *tcon;
248
249                         tcon = &tcparray->connections[i];
250                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
251                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
252                                 ctdb_addr_to_str(&tcon->src_addr),
253                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
254                         ret = ctdb_sys_send_tcp(
255                                 &tcon->src_addr, 
256                                 &tcon->dst_addr,
257                                 0, 0, 0);
258                         if (ret != 0) {
259                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
260                                         ctdb_addr_to_str(&tcon->src_addr)));
261                         }
262                 }
263         }
264
265         arp->count++;
266
267         if (arp->count == CTDB_ARP_REPEAT) {
268                 talloc_free(arp);
269                 return;
270         }
271
272         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
273                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
274                         ctdb_control_send_arp, arp);
275 }
276
277 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
278                                        struct ctdb_vnn *vnn)
279 {
280         struct ctdb_takeover_arp *arp;
281         struct ctdb_tcp_array *tcparray;
282
283         if (!vnn->takeover_ctx) {
284                 vnn->takeover_ctx = talloc_new(vnn);
285                 if (!vnn->takeover_ctx) {
286                         return -1;
287                 }
288         }
289
290         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
291         if (!arp) {
292                 return -1;
293         }
294
295         arp->ctdb = ctdb;
296         arp->addr = vnn->public_address;
297         arp->vnn  = vnn;
298
299         tcparray = vnn->tcp_array;
300         if (tcparray) {
301                 /* add all of the known tcp connections for this IP to the
302                    list of tcp connections to send tickle acks for */
303                 arp->tcparray = talloc_steal(arp, tcparray);
304
305                 vnn->tcp_array = NULL;
306                 vnn->tcp_update_needed = true;
307         }
308
309         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
310                         timeval_zero(), ctdb_control_send_arp, arp);
311
312         return 0;
313 }
314
315 struct takeover_callback_state {
316         struct ctdb_req_control *c;
317         ctdb_sock_addr *addr;
318         struct ctdb_vnn *vnn;
319 };
320
/*
  state carried from ctdb_do_takeip() to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
	/* the control that is answered once the event script has finished */
	struct ctdb_req_control *c;
	/* public address entry being taken over */
	struct ctdb_vnn *vnn;
};
325
326 /*
327   called when takeip event finishes
328  */
329 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
330                                     void *private_data)
331 {
332         struct ctdb_do_takeip_state *state =
333                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
334         int32_t ret;
335         TDB_DATA data;
336
337         if (status != 0) {
338                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
339         
340                 if (status == -ETIME) {
341                         ctdb_ban_self(ctdb);
342                 }
343                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
344                                  ctdb_addr_to_str(&state->vnn->public_address),
345                                  ctdb_vnn_iface_string(state->vnn)));
346                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
347
348                 node->flags |= NODE_FLAGS_UNHEALTHY;
349                 talloc_free(state);
350                 return;
351         }
352
353         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
354         if (ret != 0) {
355                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
356                 talloc_free(state);
357                 return;
358         }
359
360         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
361         data.dsize = strlen((char *)data.dptr) + 1;
362         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
363
364         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
365
366
367         /* the control succeeded */
368         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
369         talloc_free(state);
370         return;
371 }
372
373 /*
374   take over an ip address
375  */
376 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
377                               struct ctdb_req_control *c,
378                               struct ctdb_vnn *vnn)
379 {
380         int ret;
381         struct ctdb_do_takeip_state *state;
382
383         ret = ctdb_vnn_assign_iface(ctdb, vnn);
384         if (ret != 0) {
385                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
386                                  "assin a usable interface\n",
387                                  ctdb_addr_to_str(&vnn->public_address),
388                                  vnn->public_netmask_bits));
389                 return -1;
390         }
391
392         state = talloc(vnn, struct ctdb_do_takeip_state);
393         CTDB_NO_MEMORY(ctdb, state);
394
395         state->c = talloc_steal(ctdb, c);
396         state->vnn   = vnn;
397
398         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
399                             ctdb_addr_to_str(&vnn->public_address),
400                             vnn->public_netmask_bits,
401                             ctdb_vnn_iface_string(vnn)));
402
403         ret = ctdb_event_script_callback(ctdb,
404                                          state,
405                                          ctdb_do_takeip_callback,
406                                          state,
407                                          false,
408                                          CTDB_EVENT_TAKE_IP,
409                                          "%s %s %u",
410                                          ctdb_vnn_iface_string(vnn),
411                                          ctdb_addr_to_str(&vnn->public_address),
412                                          vnn->public_netmask_bits);
413
414         if (ret != 0) {
415                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
416                         ctdb_addr_to_str(&vnn->public_address),
417                         ctdb_vnn_iface_string(vnn)));
418                 talloc_free(state);
419                 return -1;
420         }
421
422         return 0;
423 }
424
/*
  state carried from ctdb_do_updateip() to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
	/* the control that is answered once the event script has finished */
	struct ctdb_req_control *c;
	/* interface the address lived on before the move; restored on failure */
	struct ctdb_iface *old;
	/* public address entry being moved */
	struct ctdb_vnn *vnn;
};
430
431 /*
432   called when updateip event finishes
433  */
434 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
435                                       void *private_data)
436 {
437         struct ctdb_do_updateip_state *state =
438                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
439         int32_t ret;
440
441         if (status != 0) {
442                 if (status == -ETIME) {
443                         ctdb_ban_self(ctdb);
444                 }
445                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
446                         ctdb_addr_to_str(&state->vnn->public_address),
447                         state->old->name,
448                         ctdb_vnn_iface_string(state->vnn)));
449
450                 /*
451                  * All we can do is reset the old interface
452                  * and let the next run fix it
453                  */
454                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
455                 state->vnn->iface = state->old;
456                 state->vnn->iface->references++;
457
458                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
459                 talloc_free(state);
460                 return;
461         }
462
463         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
464         if (ret != 0) {
465                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
466                 talloc_free(state);
467                 return;
468         }
469
470         /* the control succeeded */
471         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
472         talloc_free(state);
473         return;
474 }
475
476 /*
477   update (move) an ip address
478  */
479 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
480                                 struct ctdb_req_control *c,
481                                 struct ctdb_vnn *vnn)
482 {
483         int ret;
484         struct ctdb_do_updateip_state *state;
485         struct ctdb_iface *old = vnn->iface;
486         char *new_name;
487
488         ctdb_vnn_unassign_iface(ctdb, vnn);
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
492                                  "assin a usable interface (old iface '%s')\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits,
495                                  old->name));
496                 return -1;
497         }
498
499         new_name = ctdb_vnn_iface_string(vnn);
500         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
501                 /* A benign update from one interface onto itself.
502                  * no need to run the eventscripts in this case, just return
503                  * success.
504                  */
505                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
506                 return 0;
507         }
508
509         state = talloc(vnn, struct ctdb_do_updateip_state);
510         CTDB_NO_MEMORY(ctdb, state);
511
512         state->c = talloc_steal(ctdb, c);
513         state->old = old;
514         state->vnn = vnn;
515
516         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
517                             "interface %s to %s\n",
518                             ctdb_addr_to_str(&vnn->public_address),
519                             vnn->public_netmask_bits,
520                             old->name,
521                             new_name));
522
523         ret = ctdb_event_script_callback(ctdb,
524                                          state,
525                                          ctdb_do_updateip_callback,
526                                          state,
527                                          false,
528                                          CTDB_EVENT_UPDATE_IP,
529                                          "%s %s %s %u",
530                                          state->old->name,
531                                          new_name,
532                                          ctdb_addr_to_str(&vnn->public_address),
533                                          vnn->public_netmask_bits);
534         if (ret != 0) {
535                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
536                                  ctdb_addr_to_str(&vnn->public_address),
537                                  old->name, new_name));
538                 talloc_free(state);
539                 return -1;
540         }
541
542         return 0;
543 }
544
545 /*
546   Find the vnn of the node that has a public ip address
547   returns -1 if the address is not known as a public address
548  */
549 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
550 {
551         struct ctdb_vnn *vnn;
552
553         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
554                 if (ctdb_same_ip(&vnn->public_address, addr)) {
555                         return vnn;
556                 }
557         }
558
559         return NULL;
560 }
561
562 /*
563   take over an ip address
564  */
565 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
566                                  struct ctdb_req_control *c,
567                                  TDB_DATA indata,
568                                  bool *async_reply)
569 {
570         int ret;
571         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
572         struct ctdb_vnn *vnn;
573         bool have_ip = false;
574         bool do_updateip = false;
575         bool do_takeip = false;
576         struct ctdb_iface *best_iface = NULL;
577
578         if (pip->pnn != ctdb->pnn) {
579                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
580                                  "with pnn %d, but we're node %d\n",
581                                  ctdb_addr_to_str(&pip->addr),
582                                  pip->pnn, ctdb->pnn));
583                 return -1;
584         }
585
586         /* update out vnn list */
587         vnn = find_public_ip_vnn(ctdb, &pip->addr);
588         if (vnn == NULL) {
589                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
590                         ctdb_addr_to_str(&pip->addr)));
591                 return 0;
592         }
593
594         have_ip = ctdb_sys_have_ip(&pip->addr);
595         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
596         if (best_iface == NULL) {
597                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
598                                  "a usable interface (old %s, have_ip %d)\n",
599                                  ctdb_addr_to_str(&vnn->public_address),
600                                  vnn->public_netmask_bits,
601                                  ctdb_vnn_iface_string(vnn),
602                                  have_ip));
603                 return -1;
604         }
605
606         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
607                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
608                 have_ip = false;
609         }
610
611         if (vnn->iface == NULL && have_ip) {
612                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
613                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
614                                  ctdb_addr_to_str(&vnn->public_address)));
615                 return 0;
616         }
617
618         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
619                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
620                                   "and we have it on iface[%s], but it was assigned to node %d"
621                                   "and we are node %d, banning ourself\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
624                 ctdb_ban_self(ctdb);
625                 return -1;
626         }
627
628         if (vnn->pnn == -1 && have_ip) {
629                 vnn->pnn = ctdb->pnn;
630                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
631                                   "and we already have it on iface[%s], update local daemon\n",
632                                  ctdb_addr_to_str(&vnn->public_address),
633                                   ctdb_vnn_iface_string(vnn)));
634                 return 0;
635         }
636
637         if (vnn->iface) {
638                 if (vnn->iface->link_up) {
639                         /* only move when the rebalance gains something */
640                         if (vnn->iface->references > (best_iface->references + 1)) {
641                                 do_updateip = true;
642                         }
643                 } else if (vnn->iface != best_iface) {
644                         do_updateip = true;
645                 }
646         }
647
648         if (!have_ip) {
649                 if (do_updateip) {
650                         ctdb_vnn_unassign_iface(ctdb, vnn);
651                         do_updateip = false;
652                 }
653                 do_takeip = true;
654         }
655
656         if (do_takeip) {
657                 ret = ctdb_do_takeip(ctdb, c, vnn);
658                 if (ret != 0) {
659                         return -1;
660                 }
661         } else if (do_updateip) {
662                 ret = ctdb_do_updateip(ctdb, c, vnn);
663                 if (ret != 0) {
664                         return -1;
665                 }
666         } else {
667                 /*
668                  * The interface is up and the kernel known the ip
669                  * => do nothing
670                  */
671                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
672                         ctdb_addr_to_str(&pip->addr),
673                         vnn->public_netmask_bits,
674                         ctdb_vnn_iface_string(vnn)));
675                 return 0;
676         }
677
678         /* tell ctdb_control.c that we will be replying asynchronously */
679         *async_reply = true;
680
681         return 0;
682 }
683
684 /*
685   takeover an ip address old v4 style
686  */
687 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
688                                 struct ctdb_req_control *c,
689                                 TDB_DATA indata, 
690                                 bool *async_reply)
691 {
692         TDB_DATA data;
693         
694         data.dsize = sizeof(struct ctdb_public_ip);
695         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
696         CTDB_NO_MEMORY(ctdb, data.dptr);
697         
698         memcpy(data.dptr, indata.dptr, indata.dsize);
699         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
700 }
701
702 /*
703   kill any clients that are registered with a IP that is being released
704  */
705 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
706 {
707         struct ctdb_client_ip *ip;
708
709         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
710                 ctdb_addr_to_str(addr)));
711
712         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
713                 ctdb_sock_addr tmp_addr;
714
715                 tmp_addr = ip->addr;
716                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
717                         ip->client_id,
718                         ctdb_addr_to_str(&ip->addr)));
719
720                 if (ctdb_same_ip(&tmp_addr, addr)) {
721                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
722                                                                      ip->client_id, 
723                                                                      struct ctdb_client);
724                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
725                                 ip->client_id,
726                                 ctdb_addr_to_str(&ip->addr),
727                                 client->pid));
728
729                         if (client->pid != 0) {
730                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
731                                         (unsigned)client->pid,
732                                         ctdb_addr_to_str(addr),
733                                         ip->client_id));
734                                 kill(client->pid, SIGKILL);
735                         }
736                 }
737         }
738 }
739
740 /*
741   called when releaseip event finishes
742  */
743 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
744                                 void *private_data)
745 {
746         struct takeover_callback_state *state = 
747                 talloc_get_type(private_data, struct takeover_callback_state);
748         TDB_DATA data;
749
750         if (status == -ETIME) {
751                 ctdb_ban_self(ctdb);
752         }
753
754         /* send a message to all clients of this node telling them
755            that the cluster has been reconfigured and they should
756            release any sockets on this IP */
757         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
758         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
759         data.dsize = strlen((char *)data.dptr)+1;
760
761         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
762
763         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
764
765         /* kill clients that have registered with this IP */
766         release_kill_clients(ctdb, state->addr);
767
768         ctdb_vnn_unassign_iface(ctdb, state->vnn);
769
770         /* the control succeeded */
771         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
772         talloc_free(state);
773 }
774
775 /*
776   release an ip address
777  */
778 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
779                                 struct ctdb_req_control *c,
780                                 TDB_DATA indata, 
781                                 bool *async_reply)
782 {
783         int ret;
784         struct takeover_callback_state *state;
785         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
786         struct ctdb_vnn *vnn;
787
788         /* update our vnn list */
789         vnn = find_public_ip_vnn(ctdb, &pip->addr);
790         if (vnn == NULL) {
791                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
792                         ctdb_addr_to_str(&pip->addr)));
793                 return 0;
794         }
795         vnn->pnn = pip->pnn;
796
797         /* stop any previous arps */
798         talloc_free(vnn->takeover_ctx);
799         vnn->takeover_ctx = NULL;
800
801         if (!ctdb_sys_have_ip(&pip->addr)) {
802                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits, 
805                         ctdb_vnn_iface_string(vnn)));
806                 ctdb_vnn_unassign_iface(ctdb, vnn);
807                 return 0;
808         }
809
810         if (vnn->iface == NULL) {
811                 DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
812                                  "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
813                                  ctdb_addr_to_str(&vnn->public_address)));
814                 return 0;
815         }
816
817         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
818                 ctdb_addr_to_str(&pip->addr),
819                 vnn->public_netmask_bits, 
820                 ctdb_vnn_iface_string(vnn),
821                 pip->pnn));
822
823         state = talloc(ctdb, struct takeover_callback_state);
824         CTDB_NO_MEMORY(ctdb, state);
825
826         state->c = talloc_steal(state, c);
827         state->addr = talloc(state, ctdb_sock_addr);       
828         CTDB_NO_MEMORY(ctdb, state->addr);
829         *state->addr = pip->addr;
830         state->vnn   = vnn;
831
832         ret = ctdb_event_script_callback(ctdb, 
833                                          state, release_ip_callback, state,
834                                          false,
835                                          CTDB_EVENT_RELEASE_IP,
836                                          "%s %s %u",
837                                          ctdb_vnn_iface_string(vnn),
838                                          ctdb_addr_to_str(&pip->addr),
839                                          vnn->public_netmask_bits);
840         if (ret != 0) {
841                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
842                         ctdb_addr_to_str(&pip->addr),
843                         ctdb_vnn_iface_string(vnn)));
844                 talloc_free(state);
845                 return -1;
846         }
847
848         /* tell the control that we will be reply asynchronously */
849         *async_reply = true;
850         return 0;
851 }
852
853 /*
854   release an ip address old v4 style
855  */
856 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
857                                 struct ctdb_req_control *c,
858                                 TDB_DATA indata, 
859                                 bool *async_reply)
860 {
861         TDB_DATA data;
862         
863         data.dsize = sizeof(struct ctdb_public_ip);
864         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
865         CTDB_NO_MEMORY(ctdb, data.dptr);
866         
867         memcpy(data.dptr, indata.dptr, indata.dsize);
868         return ctdb_control_release_ip(ctdb, c, data, async_reply);
869 }
870
871
872 static int ctdb_add_public_address(struct ctdb_context *ctdb,
873                                    ctdb_sock_addr *addr,
874                                    unsigned mask, const char *ifaces)
875 {
876         struct ctdb_vnn      *vnn;
877         uint32_t num = 0;
878         char *tmp;
879         const char *iface;
880         int i;
881         int ret;
882
883         /* Verify that we dont have an entry for this ip yet */
884         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
885                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
886                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
887                                 ctdb_addr_to_str(addr)));
888                         return -1;
889                 }               
890         }
891
892         /* create a new vnn structure for this ip address */
893         vnn = talloc_zero(ctdb, struct ctdb_vnn);
894         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
895         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
896         tmp = talloc_strdup(vnn, ifaces);
897         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
898         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
899                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
900                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
901                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
902                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
903                 num++;
904         }
905         talloc_free(tmp);
906         vnn->ifaces[num] = NULL;
907         vnn->public_address      = *addr;
908         vnn->public_netmask_bits = mask;
909         vnn->pnn                 = -1;
910         if (ctdb_sys_have_ip(addr)) {
911                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
912                 vnn->pnn = ctdb->pnn;
913         }
914
915         for (i=0; vnn->ifaces[i]; i++) {
916                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
917                 if (ret != 0) {
918                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
919                                            "for public_address[%s]\n",
920                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
921                         talloc_free(vnn);
922                         return -1;
923                 }
924                 if (i == 0) {
925                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
926                 }
927         }
928
929         DLIST_ADD(ctdb->vnn, vnn);
930
931         return 0;
932 }
933
934 /*
935   setup the event script directory
936 */
937 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
938 {
939         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
940         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
941         return 0;
942 }
943
944 /*
945   setup the public address lists from a file
946 */
947 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
948 {
949         char **lines;
950         int nlines;
951         int i;
952
953         lines = file_lines_load(alist, &nlines, ctdb);
954         if (lines == NULL) {
955                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
956                 return -1;
957         }
958         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
959                 nlines--;
960         }
961
962         for (i=0;i<nlines;i++) {
963                 unsigned mask;
964                 ctdb_sock_addr addr;
965                 const char *addrstr;
966                 const char *ifaces;
967                 char *tok, *line;
968
969                 line = lines[i];
970                 while ((*line == ' ') || (*line == '\t')) {
971                         line++;
972                 }
973                 if (*line == '#') {
974                         continue;
975                 }
976                 if (strcmp(line, "") == 0) {
977                         continue;
978                 }
979                 tok = strtok(line, " \t");
980                 addrstr = tok;
981                 tok = strtok(NULL, " \t");
982                 if (tok == NULL) {
983                         if (NULL == ctdb->default_public_interface) {
984                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
985                                          i+1));
986                                 talloc_free(lines);
987                                 return -1;
988                         }
989                         ifaces = ctdb->default_public_interface;
990                 } else {
991                         ifaces = tok;
992                 }
993
994                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
995                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
996                         talloc_free(lines);
997                         return -1;
998                 }
999                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
1000                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1001                         talloc_free(lines);
1002                         return -1;
1003                 }
1004         }
1005
1006         talloc_free(lines);
1007         return 0;
1008 }
1009
1010 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1011                               const char *iface,
1012                               const char *ip)
1013 {
1014         struct ctdb_vnn *svnn;
1015         struct ctdb_iface *cur = NULL;
1016         bool ok;
1017         int ret;
1018
1019         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1020         CTDB_NO_MEMORY(ctdb, svnn);
1021
1022         svnn->ifaces = talloc_array(svnn, const char *, 2);
1023         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1024         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1025         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1026         svnn->ifaces[1] = NULL;
1027
1028         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1029         if (!ok) {
1030                 talloc_free(svnn);
1031                 return -1;
1032         }
1033
1034         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1035         if (ret != 0) {
1036                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1037                                    "for single_ip[%s]\n",
1038                                    svnn->ifaces[0],
1039                                    ctdb_addr_to_str(&svnn->public_address)));
1040                 talloc_free(svnn);
1041                 return -1;
1042         }
1043
1044         /* assume the single public ip interface is initially "good" */
1045         cur = ctdb_find_iface(ctdb, iface);
1046         if (cur == NULL) {
1047                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1048                 return -1;
1049         }
1050         cur->link_up = true;
1051
1052         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1053         if (ret != 0) {
1054                 talloc_free(svnn);
1055                 return -1;
1056         }
1057
1058         ctdb->single_ip_vnn = svnn;
1059         return 0;
1060 }
1061
1062 /* Given a physical node, return the number of
1063    public addresses that is currently assigned to this node.
1064 */
1065 static int node_ip_coverage(struct ctdb_context *ctdb, 
1066         int32_t pnn,
1067         struct ctdb_public_ip_list *ips)
1068 {
1069         int num=0;
1070
1071         for (;ips;ips=ips->next) {
1072                 if (ips->pnn == pnn) {
1073                         num++;
1074                 }
1075         }
1076         return num;
1077 }
1078
1079
1080 /* Check if this is a public ip known to the node, i.e. can that
1081    node takeover this ip ?
1082 */
1083 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1084                 struct ctdb_public_ip_list *ip)
1085 {
1086         struct ctdb_all_public_ips *public_ips;
1087         int i;
1088
1089         public_ips = ctdb->nodes[pnn]->available_public_ips;
1090
1091         if (public_ips == NULL) {
1092                 return -1;
1093         }
1094
1095         for (i=0;i<public_ips->num;i++) {
1096                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1097                         /* yes, this node can serve this public ip */
1098                         return 0;
1099                 }
1100         }
1101
1102         return -1;
1103 }
1104
1105
1106 /* search the node lists list for a node to takeover this ip.
1107    pick the node that currently are serving the least number of ips
1108    so that the ips get spread out evenly.
1109 */
1110 static int find_takeover_node(struct ctdb_context *ctdb, 
1111                 struct ctdb_node_map *nodemap, uint32_t mask, 
1112                 struct ctdb_public_ip_list *ip,
1113                 struct ctdb_public_ip_list *all_ips)
1114 {
1115         int pnn, min=0, num;
1116         int i;
1117
1118         pnn    = -1;
1119         for (i=0;i<nodemap->num;i++) {
1120                 if (nodemap->nodes[i].flags & mask) {
1121                         /* This node is not healty and can not be used to serve
1122                            a public address 
1123                         */
1124                         continue;
1125                 }
1126
1127                 /* verify that this node can serve this ip */
1128                 if (can_node_serve_ip(ctdb, i, ip)) {
1129                         /* no it couldnt   so skip to the next node */
1130                         continue;
1131                 }
1132
1133                 num = node_ip_coverage(ctdb, i, all_ips);
1134                 /* was this the first node we checked ? */
1135                 if (pnn == -1) {
1136                         pnn = i;
1137                         min  = num;
1138                 } else {
1139                         if (num < min) {
1140                                 pnn = i;
1141                                 min  = num;
1142                         }
1143                 }
1144         }       
1145         if (pnn == -1) {
1146                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1147                         ctdb_addr_to_str(&ip->addr)));
1148
1149                 return -1;
1150         }
1151
1152         ip->pnn = pnn;
1153         return 0;
1154 }
1155
1156 #define IP_KEYLEN       4
1157 static uint32_t *ip_key(ctdb_sock_addr *ip)
1158 {
1159         static uint32_t key[IP_KEYLEN];
1160
1161         bzero(key, sizeof(key));
1162
1163         switch (ip->sa.sa_family) {
1164         case AF_INET:
1165                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1166                 break;
1167         case AF_INET6:
1168                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
1169                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
1170                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
1171                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
1172                 break;
1173         default:
1174                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1175                 return key;
1176         }
1177
1178         return key;
1179 }
1180
1181 static void *add_ip_callback(void *parm, void *data)
1182 {
1183         struct ctdb_public_ip_list *this_ip = parm; 
1184         struct ctdb_public_ip_list *prev_ip = data; 
1185
1186         if (prev_ip == NULL) {
1187                 return parm;
1188         }
1189         if (this_ip->pnn == -1) {
1190                 this_ip->pnn = prev_ip->pnn;
1191         }
1192
1193         return parm;
1194 }
1195
1196 void getips_count_callback(void *param, void *data)
1197 {
1198         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1199         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1200
1201         new_ip->next = *ip_list;
1202         *ip_list     = new_ip;
1203 }
1204
1205 static struct ctdb_public_ip_list *
1206 create_merged_ip_list(struct ctdb_context *ctdb)
1207 {
1208         int i, j;
1209         struct ctdb_public_ip_list *ip_list;
1210         struct ctdb_all_public_ips *public_ips;
1211
1212         if (ctdb->ip_tree != NULL) {
1213                 talloc_free(ctdb->ip_tree);
1214                 ctdb->ip_tree = NULL;
1215         }
1216         ctdb->ip_tree = trbt_create(ctdb, 0);
1217
1218         for (i=0;i<ctdb->num_nodes;i++) {
1219                 public_ips = ctdb->nodes[i]->known_public_ips;
1220
1221                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1222                         continue;
1223                 }
1224
1225                 /* there were no public ips for this node */
1226                 if (public_ips == NULL) {
1227                         continue;
1228                 }               
1229
1230                 for (j=0;j<public_ips->num;j++) {
1231                         struct ctdb_public_ip_list *tmp_ip; 
1232
1233                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1234                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1235                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1236                         tmp_ip->addr = public_ips->ips[j].addr;
1237                         tmp_ip->next = NULL;
1238
1239                         trbt_insertarray32_callback(ctdb->ip_tree,
1240                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1241                                 add_ip_callback,
1242                                 tmp_ip);
1243                 }
1244         }
1245
1246         ip_list = NULL;
1247         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1248
1249         return ip_list;
1250 }
1251
1252 /* 
1253  * This is the length of the longtest common prefix between the IPs.
1254  * It is calculated by XOR-ing the 2 IPs together and counting the
1255  * number of leading zeroes.  The implementation means that all
1256  * addresses end up being 128 bits long.
1257  * Not static, so we can easily link it into a unit test.
1258  *
1259  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1260  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1261  * lots of nodes and IP addresses?
1262  */
1263 uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1264 {
1265         uint32_t ip1_k[IP_KEYLEN];
1266         uint32_t *t;
1267         int i;
1268         uint32_t x;
1269
1270         uint32_t distance = 0;
1271
1272         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1273         t = ip_key(ip2);
1274         for (i=0; i<IP_KEYLEN; i++) {
1275                 x = ip1_k[i] ^ t[i];
1276                 if (x == 0) {
1277                         distance += 32;
1278                 } else {
1279                         /* Count number of leading zeroes. 
1280                          * FIXME? This could be optimised...
1281                          */
1282                         while ((x & (1 << 31)) == 0) {
1283                                 x <<= 1;
1284                                 distance += 1;
1285                         }
1286                 }
1287         }
1288
1289         return distance;
1290 }
1291
1292 /* Calculate the IP distance for the given IP relative to IPs on the
1293    given node.  The ips argument is generally the all_ips variable
1294    used in the main part of the algorithm.
1295  * Not static, so we can easily link it into a unit test.
1296  */
1297 uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1298                            struct ctdb_public_ip_list *ips,
1299                            int pnn)
1300 {
1301         struct ctdb_public_ip_list *t;
1302         uint32_t d;
1303
1304         uint32_t sum = 0;
1305
1306         for (t=ips; t != NULL; t=t->next) {
1307                 if (t->pnn != pnn) {
1308                         continue;
1309                 }
1310
1311                 /* Optimisation: We never calculate the distance
1312                  * between an address and itself.  This allows us to
1313                  * calculate the effect of removing an address from a
1314                  * node by simply calculating the distance between
1315                  * that address and all of the exitsing addresses.
1316                  * Moreover, we assume that we're only ever dealing
1317                  * with addresses from all_ips so we can identify an
1318                  * address via a pointer rather than doing a more
1319                  * expensive address comparison. */
1320                 if (&(t->addr) == ip) {
1321                         continue;
1322                 }
1323
1324                 d = ip_distance(ip, &(t->addr));
1325                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1326         }
1327
1328         return sum;
1329 }
1330
1331 /* Return the LCP2 imbalance metric for addresses currently assigned
1332    to the given node.
1333  * Not static, so we can easily link it into a unit test.
1334  */
1335 uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1336 {
1337         struct ctdb_public_ip_list *t;
1338
1339         uint32_t imbalance = 0;
1340
1341         for (t=all_ips; t!=NULL; t=t->next) {
1342                 if (t->pnn != pnn) {
1343                         continue;
1344                 }
1345                 /* Pass the rest of the IPs rather than the whole
1346                    all_ips input list.
1347                 */
1348                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1349         }
1350
1351         return imbalance;
1352 }
1353
1354 /* Allocate any unassigned IPs just by looping through the IPs and
1355  * finding the best node for each.
1356  * Not static, so we can easily link it into a unit test.
1357  */
1358 void basic_allocate_unassigned(struct ctdb_context *ctdb,
1359                                struct ctdb_node_map *nodemap,
1360                                uint32_t mask,
1361                                struct ctdb_public_ip_list *all_ips)
1362 {
1363         struct ctdb_public_ip_list *tmp_ip;
1364
1365         /* loop over all ip's and find a physical node to cover for 
1366            each unassigned ip.
1367         */
1368         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1369                 if (tmp_ip->pnn == -1) {
1370                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1371                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1372                                         ctdb_addr_to_str(&tmp_ip->addr)));
1373                         }
1374                 }
1375         }
1376 }
1377
1378 /* Basic non-deterministic rebalancing algorithm.
1379  * Not static, so we can easily link it into a unit test.
1380  */
1381 bool basic_failback(struct ctdb_context *ctdb,
1382                     struct ctdb_node_map *nodemap,
1383                     uint32_t mask,
1384                     struct ctdb_public_ip_list *all_ips,
1385                     int num_ips,
1386                     int *retries)
1387 {
1388         int i;
1389         int maxnode, maxnum=0, minnode, minnum=0, num;
1390         struct ctdb_public_ip_list *tmp_ip;
1391
1392         /* for each ip address, loop over all nodes that can serve
1393            this ip and make sure that the difference between the node
1394            serving the most and the node serving the least ip's are
1395            not greater than 1.
1396         */
1397         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1398                 if (tmp_ip->pnn == -1) {
1399                         continue;
1400                 }
1401
1402                 /* Get the highest and lowest number of ips's served by any 
1403                    valid node which can serve this ip.
1404                 */
1405                 maxnode = -1;
1406                 minnode = -1;
1407                 for (i=0;i<nodemap->num;i++) {
1408                         if (nodemap->nodes[i].flags & mask) {
1409                                 continue;
1410                         }
1411
1412                         /* only check nodes that can actually serve this ip */
1413                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1414                                 /* no it couldnt   so skip to the next node */
1415                                 continue;
1416                         }
1417
1418                         num = node_ip_coverage(ctdb, i, all_ips);
1419                         if (maxnode == -1) {
1420                                 maxnode = i;
1421                                 maxnum  = num;
1422                         } else {
1423                                 if (num > maxnum) {
1424                                         maxnode = i;
1425                                         maxnum  = num;
1426                                 }
1427                         }
1428                         if (minnode == -1) {
1429                                 minnode = i;
1430                                 minnum  = num;
1431                         } else {
1432                                 if (num < minnum) {
1433                                         minnode = i;
1434                                         minnum  = num;
1435                                 }
1436                         }
1437                 }
1438                 if (maxnode == -1) {
1439                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1440                                 ctdb_addr_to_str(&tmp_ip->addr)));
1441
1442                         continue;
1443                 }
1444
1445                 /* If we want deterministic IPs then dont try to reallocate 
1446                    them to spread out the load.
1447                 */
1448                 if (1 == ctdb->tunable.deterministic_public_ips) {
1449                         continue;
1450                 }
1451
1452                 /* if the spread between the smallest and largest coverage by
1453                    a node is >=2 we steal one of the ips from the node with
1454                    most coverage to even things out a bit.
1455                    try to do this a limited number of times since we dont
1456                    want to spend too much time balancing the ip coverage.
1457                 */
1458                 if ( (maxnum > minnum+1)
1459                      && (*retries < (num_ips + 5)) ){
1460                         struct ctdb_public_ip_list *tmp;
1461
1462                         /* mark one of maxnode's vnn's as unassigned and try
1463                            again
1464                         */
1465                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1466                                 if (tmp->pnn == maxnode) {
1467                                         tmp->pnn = -1;
1468                                         (*retries)++;
1469                                         return true;
1470                                 }
1471                         }
1472                 }
1473         }
1474
1475         return false;
1476 }
1477
1478 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1479  * that we can unit test it.
1480  * Not static, so we can easily link it into a unit test.
1481  */
1482 void lcp2_init(struct ctdb_context * tmp_ctx,
1483                struct ctdb_node_map * nodemap,
1484                uint32_t mask,
1485                struct ctdb_public_ip_list *all_ips,
1486                uint32_t **lcp2_imbalances,
1487                bool **newly_healthy)
1488 {
1489         int i;
1490         struct ctdb_public_ip_list *tmp_ip;
1491
1492         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1493         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1494         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1495         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1496
1497         for (i=0;i<nodemap->num;i++) {
1498                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1499                 /* First step: is the node "healthy"? */
1500                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1501         }
1502
1503         /* 2nd step: if a ndoe has IPs assigned then it must have been
1504          * healthy before, so we remove it from consideration... */
1505         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1506                 if (tmp_ip->pnn != -1) {
1507                         (*newly_healthy)[tmp_ip->pnn] = false;
1508                 }
1509         }
1510 }
1511
1512 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1513  * the IP/node combination that will cost the least.
1514  * Not static, so we can easily link it into a unit test.
1515  */
1516 void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1517                               struct ctdb_node_map *nodemap,
1518                               uint32_t mask,
1519                               struct ctdb_public_ip_list *all_ips,
1520                               uint32_t *lcp2_imbalances)
1521 {
1522         struct ctdb_public_ip_list *tmp_ip;
1523         int dstnode;
1524
1525         int minnode;
1526         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1527         struct ctdb_public_ip_list *minip;
1528
1529         bool should_loop = true;
1530         bool have_unassigned = true;
1531
1532         while (have_unassigned && should_loop) {
1533                 should_loop = false;
1534
1535                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1536                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1537
1538                 minnode = -1;
1539                 mindsum = 0;
1540                 minip = NULL;
1541
1542                 /* loop over each unassigned ip. */
1543                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1544                         if (tmp_ip->pnn != -1) {
1545                                 continue;
1546                         }
1547
1548                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1549                                 /* only check nodes that can actually serve this ip */
1550                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1551                                         /* no it couldnt   so skip to the next node */
1552                                         continue;
1553                                 }
1554                                 if (nodemap->nodes[dstnode].flags & mask) {
1555                                         continue;
1556                                 }
1557
1558                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1559                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1560                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1561                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1562                                                    dstnode,
1563                                                    dstimbl - lcp2_imbalances[dstnode]));
1564
1565
1566                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1567                                         minnode = dstnode;
1568                                         minimbl = dstimbl;
1569                                         mindsum = dstdsum;
1570                                         minip = tmp_ip;
1571                                         should_loop = true;
1572                                 }
1573                         }
1574                 }
1575
1576                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1577
1578                 /* If we found one then assign it to the given node. */
1579                 if (minnode != -1) {
1580                         minip->pnn = minnode;
1581                         lcp2_imbalances[minnode] = minimbl;
1582                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1583                                           ctdb_addr_to_str(&(minip->addr)),
1584                                           minnode,
1585                                           mindsum));
1586                 }
1587
1588                 /* There might be a better way but at least this is clear. */
1589                 have_unassigned = false;
1590                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1591                         if (tmp_ip->pnn == -1) {
1592                                 have_unassigned = true;
1593                         }
1594                 }
1595         }
1596
1597         /* We know if we have an unassigned addresses so we might as
1598          * well optimise.
1599          */
1600         if (have_unassigned) {
1601                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1602                         if (tmp_ip->pnn == -1) {
1603                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1604                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1605                         }
1606                 }
1607         }
1608 }
1609
1610 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
1611  * to move IPs from, determines the best IP/destination node
1612  * combination to move from the source node.
1613  *
1614  * Not static, so we can easily link it into a unit test.
1615  */
1616 bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1617                              struct ctdb_node_map *nodemap,
1618                              struct ctdb_public_ip_list *all_ips,
1619                              int srcnode,
1620                              uint32_t candimbl,
1621                              uint32_t *lcp2_imbalances,
1622                              bool *newly_healthy)
1623 {
1624         int dstnode, mindstnode;
1625         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1626         uint32_t minsrcimbl, mindstimbl;
1627         struct ctdb_public_ip_list *minip;
1628         struct ctdb_public_ip_list *tmp_ip;
1629
1630         /* Find an IP and destination node that best reduces imbalance. */
1631         minip = NULL;
1632         minsrcimbl = 0;
1633         mindstnode = -1;
1634         mindstimbl = 0;
1635
1636         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1637         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1638
1639         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1640                 /* Only consider addresses on srcnode. */
1641                 if (tmp_ip->pnn != srcnode) {
1642                         continue;
1643                 }
1644
1645                 /* What is this IP address costing the source node? */
1646                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1647                 srcimbl = candimbl - srcdsum;
1648
1649                 /* Consider this IP address would cost each potential
1650                  * destination node.  Destination nodes are limited to
1651                  * those that are newly healthy, since we don't want
1652                  * to do gratuitous failover of IPs just to make minor
1653                  * balance improvements.
1654                  */
1655                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1656                         if (! newly_healthy[dstnode]) {
1657                                 continue;
1658                         }
1659                         /* only check nodes that can actually serve this ip */
1660                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1661                                 /* no it couldnt   so skip to the next node */
1662                                 continue;
1663                         }
1664
1665                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1666                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1667                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1668                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1669                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1670                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1671
1672                         if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1673                             ((mindstnode == -1) ||                              \
1674                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1675
1676                                 minip = tmp_ip;
1677                                 minsrcimbl = srcimbl;
1678                                 mindstnode = dstnode;
1679                                 mindstimbl = dstimbl;
1680                         }
1681                 }
1682         }
1683         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1684
1685         if (mindstnode != -1) {
1686                 /* We found a move that makes things better... */
1687                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1688                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1689                                   ctdb_addr_to_str(&(minip->addr)),
1690                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1691
1692
1693                 lcp2_imbalances[srcnode] = srcimbl;
1694                 lcp2_imbalances[mindstnode] = mindstimbl;
1695                 minip->pnn = mindstnode;
1696
1697                 return true;
1698         }
1699
1700         return false;
1701         
1702 }
1703
1704 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1705  * node with the highest LCP2 imbalance, and then determines the best
1706  * IP/destination node combination to move from the source node.
1707  *
1708  * Not static, so we can easily link it into a unit test.
1709  */
1710 bool lcp2_failback(struct ctdb_context *ctdb,
1711                    struct ctdb_node_map *nodemap,
1712                    uint32_t mask,
1713                    struct ctdb_public_ip_list *all_ips,
1714                    uint32_t *lcp2_imbalances,
1715                    bool *newly_healthy)
1716 {
1717         int srcnode, i, num_newly_healthy;
1718         uint32_t maximbl, b;
1719
1720         /* It is only worth continuing if we have suitable target
1721          * nodes to transfer IPs to.  This check is much cheaper than
1722          * continuing on...
1723          */
1724         num_newly_healthy = 0;
1725         for (i = 0; i < nodemap->num; i++) {
1726                 if (newly_healthy[i]) {
1727                         num_newly_healthy++;
1728                 }
1729         }
1730         if (num_newly_healthy == 0) {
1731                 return false;
1732         }
1733
1734         /* Get the node with the highest imbalance metric. */
1735         srcnode = -1;
1736         maximbl = 0;
1737         for (i=0; i < nodemap->num; i++) {
1738                 b = lcp2_imbalances[i];
1739                 if ((srcnode == -1) || (b > maximbl)) {
1740                         srcnode = i;
1741                         maximbl = b;
1742                 }
1743         }
1744
1745         /* This means that all nodes had 0 or 1 addresses, so can't be
1746          * imbalanced.
1747          */
1748         if (maximbl == 0) {
1749                 return false;
1750         }
1751
1752         return lcp2_failback_candidate(ctdb,
1753                                        nodemap,
1754                                        all_ips,
1755                                        srcnode,
1756                                        maximbl,
1757                                        lcp2_imbalances,
1758                                        newly_healthy);
1759 }
1760
1761 /* The calculation part of the IP allocation algorithm.
1762  * Not static, so we can easily link it into a unit test.
1763  */
1764 void ctdb_takeover_run_core(struct ctdb_context *ctdb,
1765                             struct ctdb_node_map *nodemap,
1766                             struct ctdb_public_ip_list **all_ips_p)
1767 {
1768         int i, num_healthy, retries, num_ips;
1769         uint32_t mask;
1770         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1771         uint32_t *lcp2_imbalances;
1772         bool *newly_healthy;
1773
1774         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1775
1776         /* Count how many completely healthy nodes we have */
1777         num_healthy = 0;
1778         for (i=0;i<nodemap->num;i++) {
1779                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1780                         num_healthy++;
1781                 }
1782         }
1783
1784         if (num_healthy > 0) {
1785                 /* We have healthy nodes, so only consider them for 
1786                    serving public addresses
1787                 */
1788                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1789         } else {
1790                 /* We didnt have any completely healthy nodes so
1791                    use "disabled" nodes as a fallback
1792                 */
1793                 mask = NODE_FLAGS_INACTIVE;
1794         }
1795
1796         /* since nodes only know about those public addresses that
1797            can be served by that particular node, no single node has
1798            a full list of all public addresses that exist in the cluster.
1799            Walk over all node structures and create a merged list of
1800            all public addresses that exist in the cluster.
1801
1802            keep the tree of ips around as ctdb->ip_tree
1803         */
1804         all_ips = create_merged_ip_list(ctdb);
1805         *all_ips_p = all_ips; /* minimal code changes */
1806
1807         /* Count how many ips we have */
1808         num_ips = 0;
1809         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1810                 num_ips++;
1811         }
1812
1813         /* If we want deterministic ip allocations, i.e. that the ip addresses
1814            will always be allocated the same way for a specific set of
1815            available/unavailable nodes.
1816         */
1817         if (1 == ctdb->tunable.deterministic_public_ips) {              
1818                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1819                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1820                         tmp_ip->pnn = i%nodemap->num;
1821                 }
1822         }
1823
1824
1825         /* mark all public addresses with a masked node as being served by
1826            node -1
1827         */
1828         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1829                 if (tmp_ip->pnn == -1) {
1830                         continue;
1831                 }
1832                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1833                         tmp_ip->pnn = -1;
1834                 }
1835         }
1836
1837         /* verify that the assigned nodes can serve that public ip
1838            and set it to -1 if not
1839         */
1840         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1841                 if (tmp_ip->pnn == -1) {
1842                         continue;
1843                 }
1844                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1845                         /* this node can not serve this ip. */
1846                         tmp_ip->pnn = -1;
1847                 }
1848         }
1849
1850         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1851                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
1852         }
1853
1854         /* now we must redistribute all public addresses with takeover node
1855            -1 among the nodes available
1856         */
1857         retries = 0;
1858 try_again:
1859         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1860                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
1861         } else {
1862                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
1863         }
1864
1865         /* If we dont want ips to fail back after a node becomes healthy
1866            again, we wont even try to reallocat the ip addresses so that
1867            they are evenly spread out.
1868            This can NOT be used at the same time as DeterministicIPs !
1869         */
1870         if (1 == ctdb->tunable.no_ip_failback) {
1871                 if (1 == ctdb->tunable.deterministic_public_ips) {
1872                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1873                 }
1874                 goto finished;
1875         }
1876
1877
1878         /* now, try to make sure the ip adresses are evenly distributed
1879            across the node.
1880         */
1881         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1882                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
1883                         goto try_again;
1884                 }
1885         } else {
1886                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
1887                         goto try_again;
1888                 }
1889         }
1890
1891         /* finished distributing the public addresses, now just send the 
1892            info out to the nodes
1893         */
1894 finished:
1895
1896         /* at this point ->pnn is the node which will own each IP
1897            or -1 if there is no node that can cover this ip
1898         */
1899
1900         return;
1901 }
1902
1903 /*
1904   make any IP alias changes for public addresses that are necessary 
1905  */
1906 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1907 {
1908         int i;
1909         struct ctdb_public_ip ip;
1910         struct ctdb_public_ipv4 ipv4;
1911         uint32_t *nodes;
1912         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1913         TDB_DATA data;
1914         struct timeval timeout;
1915         struct client_async_data *async_data;
1916         struct ctdb_client_control_state *state;
1917         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1918
1919         /*
1920          * ip failover is completely disabled, just send out the 
1921          * ipreallocated event.
1922          */
1923         if (ctdb->tunable.disable_ip_failover != 0) {
1924                 goto ipreallocated;
1925         }
1926
1927         ZERO_STRUCT(ip);
1928
1929         /* Do the IP reassignment calculations */
1930         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
1931
1932         /* now tell all nodes to delete any alias that they should not
1933            have.  This will be a NOOP on nodes that don't currently
1934            hold the given alias */
1935         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1936         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1937
1938         for (i=0;i<nodemap->num;i++) {
1939                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1940                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1941                         continue;
1942                 }
1943
1944                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1945                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1946                                 /* This node should be serving this
1947                                    vnn so dont tell it to release the ip
1948                                 */
1949                                 continue;
1950                         }
1951                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1952                                 ipv4.pnn = tmp_ip->pnn;
1953                                 ipv4.sin = tmp_ip->addr.ip;
1954
1955                                 timeout = TAKEOVER_TIMEOUT();
1956                                 data.dsize = sizeof(ipv4);
1957                                 data.dptr  = (uint8_t *)&ipv4;
1958                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1959                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1960                                                 data, async_data,
1961                                                 &timeout, NULL);
1962                         } else {
1963                                 ip.pnn  = tmp_ip->pnn;
1964                                 ip.addr = tmp_ip->addr;
1965
1966                                 timeout = TAKEOVER_TIMEOUT();
1967                                 data.dsize = sizeof(ip);
1968                                 data.dptr  = (uint8_t *)&ip;
1969                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1970                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
1971                                                 data, async_data,
1972                                                 &timeout, NULL);
1973                         }
1974
1975                         if (state == NULL) {
1976                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1977                                 talloc_free(tmp_ctx);
1978                                 return -1;
1979                         }
1980                 
1981                         ctdb_client_async_add(async_data, state);
1982                 }
1983         }
1984         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1985                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1986                 talloc_free(tmp_ctx);
1987                 return -1;
1988         }
1989         talloc_free(async_data);
1990
1991
1992         /* tell all nodes to get their own IPs */
1993         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1994         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1995         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1996                 if (tmp_ip->pnn == -1) {
1997                         /* this IP won't be taken over */
1998                         continue;
1999                 }
2000
2001                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2002                         ipv4.pnn = tmp_ip->pnn;
2003                         ipv4.sin = tmp_ip->addr.ip;
2004
2005                         timeout = TAKEOVER_TIMEOUT();
2006                         data.dsize = sizeof(ipv4);
2007                         data.dptr  = (uint8_t *)&ipv4;
2008                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2009                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2010                                         data, async_data,
2011                                         &timeout, NULL);
2012                 } else {
2013                         ip.pnn  = tmp_ip->pnn;
2014                         ip.addr = tmp_ip->addr;
2015
2016                         timeout = TAKEOVER_TIMEOUT();
2017                         data.dsize = sizeof(ip);
2018                         data.dptr  = (uint8_t *)&ip;
2019                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
2020                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
2021                                         data, async_data,
2022                                         &timeout, NULL);
2023                 }
2024                 if (state == NULL) {
2025                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2026                         talloc_free(tmp_ctx);
2027                         return -1;
2028                 }
2029                 
2030                 ctdb_client_async_add(async_data, state);
2031         }
2032         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2033                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2034                 talloc_free(tmp_ctx);
2035                 return -1;
2036         }
2037
2038 ipreallocated:
2039         /* tell all nodes to update natwg */
2040         /* send the flags update natgw on all connected nodes */
2041         data.dptr  = discard_const("ipreallocated");
2042         data.dsize = strlen((char *)data.dptr) + 1; 
2043         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2044         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2045                                       nodes, 0, TAKEOVER_TIMEOUT(),
2046                                       false, data,
2047                                       NULL, NULL,
2048                                       NULL) != 0) {
2049                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
2050         }
2051
2052         talloc_free(tmp_ctx);
2053         return 0;
2054 }
2055
2056
2057 /*
2058   destroy a ctdb_client_ip structure
2059  */
2060 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2061 {
2062         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2063                 ctdb_addr_to_str(&ip->addr),
2064                 ntohs(ip->addr.ip.sin_port),
2065                 ip->client_id));
2066
2067         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2068         return 0;
2069 }
2070
2071 /*
2072   called by a client to inform us of a TCP connection that it is managing
2073   that should tickled with an ACK when IP takeover is done
2074   we handle both the old ipv4 style of packets as well as the new ipv4/6
2075   pdus.
2076  */
2077 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2078                                 TDB_DATA indata)
2079 {
2080         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2081         struct ctdb_control_tcp *old_addr = NULL;
2082         struct ctdb_control_tcp_addr new_addr;
2083         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2084         struct ctdb_tcp_list *tcp;
2085         struct ctdb_tcp_connection t;
2086         int ret;
2087         TDB_DATA data;
2088         struct ctdb_client_ip *ip;
2089         struct ctdb_vnn *vnn;
2090         ctdb_sock_addr addr;
2091
2092         switch (indata.dsize) {
2093         case sizeof(struct ctdb_control_tcp):
2094                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2095                 ZERO_STRUCT(new_addr);
2096                 tcp_sock = &new_addr;
2097                 tcp_sock->src.ip  = old_addr->src;
2098                 tcp_sock->dest.ip = old_addr->dest;
2099                 break;
2100         case sizeof(struct ctdb_control_tcp_addr):
2101                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2102                 break;
2103         default:
2104                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2105                                  "to ctdb_control_tcp_client. size was %d but "
2106                                  "only allowed sizes are %lu and %lu\n",
2107                                  (int)indata.dsize,
2108                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2109                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2110                 return -1;
2111         }
2112
2113         addr = tcp_sock->src;
2114         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2115         addr = tcp_sock->dest;
2116         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2117
2118         ZERO_STRUCT(addr);
2119         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2120         vnn = find_public_ip_vnn(ctdb, &addr);
2121         if (vnn == NULL) {
2122                 switch (addr.sa.sa_family) {
2123                 case AF_INET:
2124                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2125                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2126                                         ctdb_addr_to_str(&addr)));
2127                         }
2128                         break;
2129                 case AF_INET6:
2130                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2131                                 ctdb_addr_to_str(&addr)));
2132                         break;
2133                 default:
2134                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2135                 }
2136
2137                 return 0;
2138         }
2139
2140         if (vnn->pnn != ctdb->pnn) {
2141                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2142                         ctdb_addr_to_str(&addr),
2143                         client_id, client->pid));
2144                 /* failing this call will tell smbd to die */
2145                 return -1;
2146         }
2147
2148         ip = talloc(client, struct ctdb_client_ip);
2149         CTDB_NO_MEMORY(ctdb, ip);
2150
2151         ip->ctdb      = ctdb;
2152         ip->addr      = addr;
2153         ip->client_id = client_id;
2154         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2155         DLIST_ADD(ctdb->client_ip_list, ip);
2156
2157         tcp = talloc(client, struct ctdb_tcp_list);
2158         CTDB_NO_MEMORY(ctdb, tcp);
2159
2160         tcp->connection.src_addr = tcp_sock->src;
2161         tcp->connection.dst_addr = tcp_sock->dest;
2162
2163         DLIST_ADD(client->tcp_list, tcp);
2164
2165         t.src_addr = tcp_sock->src;
2166         t.dst_addr = tcp_sock->dest;
2167
2168         data.dptr = (uint8_t *)&t;
2169         data.dsize = sizeof(t);
2170
2171         switch (addr.sa.sa_family) {
2172         case AF_INET:
2173                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2174                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2175                         ctdb_addr_to_str(&tcp_sock->src),
2176                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2177                 break;
2178         case AF_INET6:
2179                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2180                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2181                         ctdb_addr_to_str(&tcp_sock->src),
2182                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2183                 break;
2184         default:
2185                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2186         }
2187
2188
2189         /* tell all nodes about this tcp connection */
2190         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2191                                        CTDB_CONTROL_TCP_ADD,
2192                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2193         if (ret != 0) {
2194                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2195                 return -1;
2196         }
2197
2198         return 0;
2199 }
2200
2201 /*
2202   find a tcp address on a list
2203  */
2204 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2205                                            struct ctdb_tcp_connection *tcp)
2206 {
2207         int i;
2208
2209         if (array == NULL) {
2210                 return NULL;
2211         }
2212
2213         for (i=0;i<array->num;i++) {
2214                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2215                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2216                         return &array->connections[i];
2217                 }
2218         }
2219         return NULL;
2220 }
2221
2222
2223
2224 /*
2225   called by a daemon to inform us of a TCP connection that one of its
2226   clients managing that should tickled with an ACK when IP takeover is
2227   done
2228  */
2229 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2230 {
2231         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2232         struct ctdb_tcp_array *tcparray;
2233         struct ctdb_tcp_connection tcp;
2234         struct ctdb_vnn *vnn;
2235
2236         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2237         if (vnn == NULL) {
2238                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2239                         ctdb_addr_to_str(&p->dst_addr)));
2240
2241                 return -1;
2242         }
2243
2244
2245         tcparray = vnn->tcp_array;
2246
2247         /* If this is the first tickle */
2248         if (tcparray == NULL) {
2249                 tcparray = talloc_size(ctdb->nodes, 
2250                         offsetof(struct ctdb_tcp_array, connections) +
2251                         sizeof(struct ctdb_tcp_connection) * 1);
2252                 CTDB_NO_MEMORY(ctdb, tcparray);
2253                 vnn->tcp_array = tcparray;
2254
2255                 tcparray->num = 0;
2256                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2257                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2258
2259                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2260                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2261                 tcparray->num++;
2262
2263                 if (tcp_update_needed) {
2264                         vnn->tcp_update_needed = true;
2265                 }
2266                 return 0;
2267         }
2268
2269
2270         /* Do we already have this tickle ?*/
2271         tcp.src_addr = p->src_addr;
2272         tcp.dst_addr = p->dst_addr;
2273         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2274                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2275                         ctdb_addr_to_str(&tcp.dst_addr),
2276                         ntohs(tcp.dst_addr.ip.sin_port),
2277                         vnn->pnn));
2278                 return 0;
2279         }
2280
2281         /* A new tickle, we must add it to the array */
2282         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2283                                         struct ctdb_tcp_connection,
2284                                         tcparray->num+1);
2285         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2286
2287         vnn->tcp_array = tcparray;
2288         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2289         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2290         tcparray->num++;
2291                                 
2292         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2293                 ctdb_addr_to_str(&tcp.dst_addr),
2294                 ntohs(tcp.dst_addr.ip.sin_port),
2295                 vnn->pnn));
2296
2297         if (tcp_update_needed) {
2298                 vnn->tcp_update_needed = true;
2299         }
2300
2301         return 0;
2302 }
2303
2304
2305 /*
2306   called by a daemon to inform us of a TCP connection that one of its
2307   clients managing that should tickled with an ACK when IP takeover is
2308   done
2309  */
2310 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2311 {
2312         struct ctdb_tcp_connection *tcpp;
2313         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2314
2315         if (vnn == NULL) {
2316                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2317                         ctdb_addr_to_str(&conn->dst_addr)));
2318                 return;
2319         }
2320
2321         /* if the array is empty we cant remove it
2322            and we dont need to do anything
2323          */
2324         if (vnn->tcp_array == NULL) {
2325                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2326                         ctdb_addr_to_str(&conn->dst_addr),
2327                         ntohs(conn->dst_addr.ip.sin_port)));
2328                 return;
2329         }
2330
2331
2332         /* See if we know this connection
2333            if we dont know this connection  then we dont need to do anything
2334          */
2335         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2336         if (tcpp == NULL) {
2337                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2338                         ctdb_addr_to_str(&conn->dst_addr),
2339                         ntohs(conn->dst_addr.ip.sin_port)));
2340                 return;
2341         }
2342
2343
2344         /* We need to remove this entry from the array.
2345            Instead of allocating a new array and copying data to it
2346            we cheat and just copy the last entry in the existing array
2347            to the entry that is to be removed and just shring the 
2348            ->num field
2349          */
2350         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2351         vnn->tcp_array->num--;
2352
2353         /* If we deleted the last entry we also need to remove the entire array
2354          */
2355         if (vnn->tcp_array->num == 0) {
2356                 talloc_free(vnn->tcp_array);
2357                 vnn->tcp_array = NULL;
2358         }               
2359
2360         vnn->tcp_update_needed = true;
2361
2362         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2363                 ctdb_addr_to_str(&conn->src_addr),
2364                 ntohs(conn->src_addr.ip.sin_port)));
2365 }
2366
2367
2368 /*
2369   called by a daemon to inform us of a TCP connection that one of its
2370   clients used are no longer needed in the tickle database
2371  */
2372 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2373 {
2374         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2375
2376         ctdb_remove_tcp_connection(ctdb, conn);
2377
2378         return 0;
2379 }
2380
2381
/*
  called when a daemon restarts - should send all tickles for all public
  addresses we are serving immediately to the new node (not implemented yet).
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
/* XXX not implemented: here we should send all tickles we are serving
   to the new node.  Until then neither argument is used and the
   control always reports success. */
        return 0;
}
2391
2392
2393 /*
2394   called when a client structure goes away - hook to remove
2395   elements from the tcp_list in all daemons
2396  */
2397 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2398 {
2399         while (client->tcp_list) {
2400                 struct ctdb_tcp_list *tcp = client->tcp_list;
2401                 DLIST_REMOVE(client->tcp_list, tcp);
2402                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2403         }
2404 }
2405
2406
2407 /*
2408   release all IPs on shutdown
2409  */
2410 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2411 {
2412         struct ctdb_vnn *vnn;
2413
2414         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2415                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2416                         ctdb_vnn_unassign_iface(ctdb, vnn);
2417                         continue;
2418                 }
2419                 if (!vnn->iface) {
2420                         continue;
2421                 }
2422                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2423                                   ctdb_vnn_iface_string(vnn),
2424                                   ctdb_addr_to_str(&vnn->public_address),
2425                                   vnn->public_netmask_bits);
2426                 release_kill_clients(ctdb, &vnn->public_address);
2427                 ctdb_vnn_unassign_iface(ctdb, vnn);
2428         }
2429 }
2430
2431
2432 /*
2433   get list of public IPs
2434  */
2435 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2436                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2437 {
2438         int i, num, len;
2439         struct ctdb_all_public_ips *ips;
2440         struct ctdb_vnn *vnn;
2441         bool only_available = false;
2442
2443         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2444                 only_available = true;
2445         }
2446
2447         /* count how many public ip structures we have */
2448         num = 0;
2449         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2450                 num++;
2451         }
2452
2453         len = offsetof(struct ctdb_all_public_ips, ips) + 
2454                 num*sizeof(struct ctdb_public_ip);
2455         ips = talloc_zero_size(outdata, len);
2456         CTDB_NO_MEMORY(ctdb, ips);
2457
2458         i = 0;
2459         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2460                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2461                         continue;
2462                 }
2463                 ips->ips[i].pnn  = vnn->pnn;
2464                 ips->ips[i].addr = vnn->public_address;
2465                 i++;
2466         }
2467         ips->num = i;
2468         len = offsetof(struct ctdb_all_public_ips, ips) +
2469                 i*sizeof(struct ctdb_public_ip);
2470
2471         outdata->dsize = len;
2472         outdata->dptr  = (uint8_t *)ips;
2473
2474         return 0;
2475 }
2476
2477
2478 /*
2479   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2480  */
2481 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2482                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2483 {
2484         int i, num, len;
2485         struct ctdb_all_public_ipsv4 *ips;
2486         struct ctdb_vnn *vnn;
2487
2488         /* count how many public ip structures we have */
2489         num = 0;
2490         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2491                 if (vnn->public_address.sa.sa_family != AF_INET) {
2492                         continue;
2493                 }
2494                 num++;
2495         }
2496
2497         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2498                 num*sizeof(struct ctdb_public_ipv4);
2499         ips = talloc_zero_size(outdata, len);
2500         CTDB_NO_MEMORY(ctdb, ips);
2501
2502         outdata->dsize = len;
2503         outdata->dptr  = (uint8_t *)ips;
2504
2505         ips->num = num;
2506         i = 0;
2507         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2508                 if (vnn->public_address.sa.sa_family != AF_INET) {
2509                         continue;
2510                 }
2511                 ips->ips[i].pnn = vnn->pnn;
2512                 ips->ips[i].sin = vnn->public_address.ip;
2513                 i++;
2514         }
2515
2516         return 0;
2517 }
2518
2519 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2520                                         struct ctdb_req_control *c,
2521                                         TDB_DATA indata,
2522                                         TDB_DATA *outdata)
2523 {
2524         int i, num, len;
2525         ctdb_sock_addr *addr;
2526         struct ctdb_control_public_ip_info *info;
2527         struct ctdb_vnn *vnn;
2528
2529         addr = (ctdb_sock_addr *)indata.dptr;
2530
2531         vnn = find_public_ip_vnn(ctdb, addr);
2532         if (vnn == NULL) {
2533                 /* if it is not a public ip   it could be our 'single ip' */
2534                 if (ctdb->single_ip_vnn) {
2535                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2536                                 vnn = ctdb->single_ip_vnn;
2537                         }
2538                 }
2539         }
2540         if (vnn == NULL) {
2541                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2542                                  "'%s'not a public address\n",
2543                                  ctdb_addr_to_str(addr)));
2544                 return -1;
2545         }
2546
2547         /* count how many public ip structures we have */
2548         num = 0;
2549         for (;vnn->ifaces[num];) {
2550                 num++;
2551         }
2552
2553         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2554                 num*sizeof(struct ctdb_control_iface_info);
2555         info = talloc_zero_size(outdata, len);
2556         CTDB_NO_MEMORY(ctdb, info);
2557
2558         info->ip.addr = vnn->public_address;
2559         info->ip.pnn = vnn->pnn;
2560         info->active_idx = 0xFFFFFFFF;
2561
2562         for (i=0; vnn->ifaces[i]; i++) {
2563                 struct ctdb_iface *cur;
2564
2565                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2566                 if (cur == NULL) {
2567                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2568                                            vnn->ifaces[i]));
2569                         return -1;
2570                 }
2571                 if (vnn->iface == cur) {
2572                         info->active_idx = i;
2573                 }
2574                 strcpy(info->ifaces[i].name, cur->name);
2575                 info->ifaces[i].link_state = cur->link_up;
2576                 info->ifaces[i].references = cur->references;
2577         }
2578         info->num = i;
2579         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2580                 i*sizeof(struct ctdb_control_iface_info);
2581
2582         outdata->dsize = len;
2583         outdata->dptr  = (uint8_t *)info;
2584
2585         return 0;
2586 }
2587
2588 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2589                                 struct ctdb_req_control *c,
2590                                 TDB_DATA *outdata)
2591 {
2592         int i, num, len;
2593         struct ctdb_control_get_ifaces *ifaces;
2594         struct ctdb_iface *cur;
2595
2596         /* count how many public ip structures we have */
2597         num = 0;
2598         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2599                 num++;
2600         }
2601
2602         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2603                 num*sizeof(struct ctdb_control_iface_info);
2604         ifaces = talloc_zero_size(outdata, len);
2605         CTDB_NO_MEMORY(ctdb, ifaces);
2606
2607         i = 0;
2608         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2609                 strcpy(ifaces->ifaces[i].name, cur->name);
2610                 ifaces->ifaces[i].link_state = cur->link_up;
2611                 ifaces->ifaces[i].references = cur->references;
2612                 i++;
2613         }
2614         ifaces->num = i;
2615         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2616                 i*sizeof(struct ctdb_control_iface_info);
2617
2618         outdata->dsize = len;
2619         outdata->dptr  = (uint8_t *)ifaces;
2620
2621         return 0;
2622 }
2623
2624 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2625                                     struct ctdb_req_control *c,
2626                                     TDB_DATA indata)
2627 {
2628         struct ctdb_control_iface_info *info;
2629         struct ctdb_iface *iface;
2630         bool link_up = false;
2631
2632         info = (struct ctdb_control_iface_info *)indata.dptr;
2633
2634         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2635                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2636                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2637                                   len, len, info->name));
2638                 return -1;
2639         }
2640
2641         switch (info->link_state) {
2642         case 0:
2643                 link_up = false;
2644                 break;
2645         case 1:
2646                 link_up = true;
2647                 break;
2648         default:
2649                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2650                                   (unsigned int)info->link_state));
2651                 return -1;
2652         }
2653
2654         if (info->references != 0) {
2655                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2656                                   (unsigned int)info->references));
2657                 return -1;
2658         }
2659
2660         iface = ctdb_find_iface(ctdb, info->name);
2661         if (iface == NULL) {
2662                 return -1;
2663         }
2664
2665         if (link_up == iface->link_up) {
2666                 return 0;
2667         }
2668
2669         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2670               ("iface[%s] has changed it's link status %s => %s\n",
2671                iface->name,
2672                iface->link_up?"up":"down",
2673                link_up?"up":"down"));
2674
2675         iface->link_up = link_up;
2676         return 0;
2677 }
2678
2679
2680 /* 
2681    structure containing the listening socket and the list of tcp connections
2682    that the ctdb daemon is to kill
2683 */
struct ctdb_kill_tcp {
        /* vnn this structure is attached to; the destructor clears
           vnn->killtcp through this pointer */
        struct ctdb_vnn *vnn;
        /* daemon context, used to reach the event context */
        struct ctdb_context *ctdb;
        /* capture socket from ctdb_sys_open_capture_socket(),
           -1 while no socket has been opened */
        int capture_fd;
        /* read event on capture_fd, NULL until it has been set up */
        struct fd_event *fde;
        /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
        trbt_tree_t *connections;
        /* opaque pointer filled in by ctdb_sys_open_capture_socket() and
           handed back to ctdb_sys_read_tcp_packet() */
        void *private_data;
};
2692
2693 /*
2694   a tcp connection that is to be killed
2695  */
struct ctdb_killtcp_con {
        /* the two endpoints of the connection, in canonicalized form */
        ctdb_sock_addr src_addr;
        ctdb_sock_addr dst_addr;
        /* number of tickles sent by the periodic timer; the connection
           is given up on once this reaches 5 */
        int count;
        /* the killtcp structure this connection is queued on */
        struct ctdb_kill_tcp *killtcp;
};
2702
2703 /* this function is used to create a key to represent this socketpair
2704    in the killtcp tree.
2705    this key is used to insert and lookup matching socketpairs that are
2706    to be tickled and RST
2707 */
2708 #define KILLTCP_KEYLEN  10
2709 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2710 {
2711         static uint32_t key[KILLTCP_KEYLEN];
2712
2713         bzero(key, sizeof(key));
2714
2715         if (src->sa.sa_family != dst->sa.sa_family) {
2716                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2717                 return key;
2718         }
2719         
2720         switch (src->sa.sa_family) {
2721         case AF_INET:
2722                 key[0]  = dst->ip.sin_addr.s_addr;
2723                 key[1]  = src->ip.sin_addr.s_addr;
2724                 key[2]  = dst->ip.sin_port;
2725                 key[3]  = src->ip.sin_port;
2726                 break;
2727         case AF_INET6:
2728                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
2729                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
2730                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
2731                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
2732                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
2733                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
2734                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
2735                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
2736                 key[8]  = dst->ip6.sin6_port;
2737                 key[9]  = src->ip6.sin6_port;
2738                 break;
2739         default:
2740                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2741                 return key;
2742         }
2743
2744         return key;
2745 }
2746
2747 /*
2748   called when we get a read event on the raw socket
2749  */
2750 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2751                                 uint16_t flags, void *private_data)
2752 {
2753         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2754         struct ctdb_killtcp_con *con;
2755         ctdb_sock_addr src, dst;
2756         uint32_t ack_seq, seq;
2757
2758         if (!(flags & EVENT_FD_READ)) {
2759                 return;
2760         }
2761
2762         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2763                                 killtcp->private_data,
2764                                 &src, &dst,
2765                                 &ack_seq, &seq) != 0) {
2766                 /* probably a non-tcp ACK packet */
2767                 return;
2768         }
2769
2770         /* check if we have this guy in our list of connections
2771            to kill
2772         */
2773         con = trbt_lookuparray32(killtcp->connections, 
2774                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2775         if (con == NULL) {
2776                 /* no this was some other packet we can just ignore */
2777                 return;
2778         }
2779
2780         /* This one has been tickled !
2781            now reset him and remove him from the list.
2782          */
2783         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2784                 ntohs(con->dst_addr.ip.sin_port),
2785                 ctdb_addr_to_str(&con->src_addr),
2786                 ntohs(con->src_addr.ip.sin_port)));
2787
2788         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2789         talloc_free(con);
2790 }
2791
2792
2793 /* when traversing the list of all tcp connections to send tickle acks to
2794    (so that we can capture the ack coming back and kill the connection
2795     by a RST)
2796    this callback is called for each connection we are currently trying to kill
2797 */
2798 static void tickle_connection_traverse(void *param, void *data)
2799 {
2800         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2801
2802         /* have tried too many times, just give up */
2803         if (con->count >= 5) {
2804                 /* can't delete in traverse: reparent to delete_cons */
2805                 talloc_steal(param, con);
2806                 return;
2807         }
2808
2809         /* othervise, try tickling it again */
2810         con->count++;
2811         ctdb_sys_send_tcp(
2812                 (ctdb_sock_addr *)&con->dst_addr,
2813                 (ctdb_sock_addr *)&con->src_addr,
2814                 0, 0, 0);
2815 }
2816
2817
2818 /* 
2819    called every second until all sentenced connections have been reset
2820  */
2821 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2822                                               struct timeval t, void *private_data)
2823 {
2824         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2825         void *delete_cons = talloc_new(NULL);
2826
2827         /* loop over all connections sending tickle ACKs */
2828         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2829
2830         /* now we've finished traverse, it's safe to do deletion. */
2831         talloc_free(delete_cons);
2832
2833         /* If there are no more connections to kill we can remove the
2834            entire killtcp structure
2835          */
2836         if ( (killtcp->connections == NULL) || 
2837              (killtcp->connections->root == NULL) ) {
2838                 talloc_free(killtcp);
2839                 return;
2840         }
2841
2842         /* try tickling them again in a seconds time
2843          */
2844         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2845                         ctdb_tickle_sentenced_connections, killtcp);
2846 }
2847
2848 /*
2849   destroy the killtcp structure
2850  */
2851 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2852 {
2853         if (killtcp->vnn) {
2854                 killtcp->vnn->killtcp = NULL;
2855         }
2856         return 0;
2857 }
2858
2859
2860 /* nothing fancy here, just unconditionally replace any existing
2861    connection structure with the new one.
2862
2863    dont even free the old one if it did exist, that one is talloc_stolen
2864    by the same node in the tree anyway and will be deleted when the new data 
2865    is deleted
2866 */
static void *add_killtcp_callback(void *parm, void *data)
{
        /* parm is the new connection structure handed to
           trbt_insertarray32_callback(); data (presumably the entry
           already stored under the same key, if any) is ignored */
        return parm;
}
2871
2872 /*
2873   add a tcp socket to the list of connections we want to RST
2874  */
static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
                                       ctdb_sock_addr *s,
                                       ctdb_sock_addr *d)
{
        ctdb_sock_addr src, dst;
        struct ctdb_kill_tcp *killtcp;
        struct ctdb_killtcp_con *con;
        struct ctdb_vnn *vnn;

        /* work on canonicalized copies of both endpoints */
        ctdb_canonicalize_ip(s, &src);
        ctdb_canonicalize_ip(d, &dst);

        /* the connection must involve one of our public addresses:
           try the destination first, then the source
         */
        vnn = find_public_ip_vnn(ctdb, &dst);
        if (vnn == NULL) {
                vnn = find_public_ip_vnn(ctdb, &src);
        }
        if (vnn == NULL) {
                /* if it is not a public ip   it could be our 'single ip' */
                if (ctdb->single_ip_vnn) {
                        if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
                                vnn = ctdb->single_ip_vnn;
                        }
                }
        }
        if (vnn == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
                return -1;
        }

        killtcp = vnn->killtcp;
        
        /* If this is the first connection to kill we must allocate
           a new structure
         */
        if (killtcp == NULL) {
                killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
                CTDB_NO_MEMORY(ctdb, killtcp);

                killtcp->vnn         = vnn;
                killtcp->ctdb        = ctdb;
                killtcp->capture_fd  = -1;
                killtcp->connections = trbt_create(killtcp, 0);

                vnn->killtcp         = killtcp;
                /* the destructor resets vnn->killtcp when this goes away */
                talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
        }



        /* create a structure that describes this connection we want to
           RST and store it in killtcp->connections
        */
        con = talloc(killtcp, struct ctdb_killtcp_con);
        CTDB_NO_MEMORY(ctdb, con);
        con->src_addr = src;
        con->dst_addr = dst;
        con->count    = 0;
        con->killtcp  = killtcp;


        /* The key is built as (dst, src) here while capture_tcp_handler
           looks up (src, dst) of the captured packet - presumably because
           the packet we capture travels in the opposite direction.
           killtcp_key() returns a static buffer, so the key is only
           valid until the next call.
         */
        trbt_insertarray32_callback(killtcp->connections,
                        KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
                        add_killtcp_callback, con);

        /* 
           If we dont have a socket to listen on yet we must create it
         */
        if (killtcp->capture_fd == -1) {
                const char *iface = ctdb_vnn_iface_string(vnn);
                killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
                if (killtcp->capture_fd == -1) {
                        DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
                                          "socket on iface '%s' for killtcp (%s)\n",
                                          iface, strerror(errno)));
                        goto failed;
                }
        }


        /* the fd event and the retry timer are only set up once per
           killtcp structure
         */
        if (killtcp->fde == NULL) {
                killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
                                            EVENT_FD_READ,
                                            capture_tcp_handler, killtcp);
                tevent_fd_set_auto_close(killtcp->fde);

                /* We also need to set up some events to tickle all these connections
                   until they are all reset
                */
                event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
                                ctdb_tickle_sentenced_connections, killtcp);
        }

        /* tickle him once now */
        ctdb_sys_send_tcp(
                &con->dst_addr,
                &con->src_addr,
                0, 0, 0);

        return 0;

failed:
        /* this frees the whole killtcp structure; the connection added
           above was allocated as its talloc child
         */
        talloc_free(vnn->killtcp);
        vnn->killtcp = NULL;
        return -1;
}
2980
2981 /*
2982   kill a TCP connection.
2983  */
2984 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2985 {
2986         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2987
2988         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2989 }
2990
2991 /*
2992   called by a daemon to inform us of the entire list of TCP tickles for
2993   a particular public address.
2994   this control should only be sent by the node that is currently serving
2995   that public address.
2996  */
2997 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2998 {
2999         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3000         struct ctdb_tcp_array *tcparray;
3001         struct ctdb_vnn *vnn;
3002
3003         /* We must at least have tickles.num or else we cant verify the size
3004            of the received data blob
3005          */
3006         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3007                                         tickles.connections)) {
3008                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3009                 return -1;
3010         }
3011
3012         /* verify that the size of data matches what we expect */
3013         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
3014                                 tickles.connections)
3015                          + sizeof(struct ctdb_tcp_connection)
3016                                  * list->tickles.num) {
3017                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3018                 return -1;
3019         }       
3020
3021         vnn = find_public_ip_vnn(ctdb, &list->addr);
3022         if (vnn == NULL) {
3023                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
3024                         ctdb_addr_to_str(&list->addr)));
3025
3026                 return 1;
3027         }
3028
3029         /* remove any old ticklelist we might have */
3030         talloc_free(vnn->tcp_array);
3031         vnn->tcp_array = NULL;
3032
3033         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3034         CTDB_NO_MEMORY(ctdb, tcparray);
3035
3036         tcparray->num = list->tickles.num;
3037
3038         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3039         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3040
3041         memcpy(tcparray->connections, &list->tickles.connections[0], 
3042                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3043
3044         /* We now have a new fresh tickle list array for this vnn */
3045         vnn->tcp_array = talloc_steal(vnn, tcparray);
3046         
3047         return 0;
3048 }
3049
/*
  called to return the full list of tickles for the public address
  provided in indata
 */
3054 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3055 {
3056         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3057         struct ctdb_control_tcp_tickle_list *list;
3058         struct ctdb_tcp_array *tcparray;
3059         int num;
3060         struct ctdb_vnn *vnn;
3061
3062         vnn = find_public_ip_vnn(ctdb, addr);
3063         if (vnn == NULL) {
3064                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3065                         ctdb_addr_to_str(addr)));
3066
3067                 return 1;
3068         }
3069
3070         tcparray = vnn->tcp_array;
3071         if (tcparray) {
3072                 num = tcparray->num;
3073         } else {
3074                 num = 0;
3075         }
3076
3077         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3078                                 tickles.connections)
3079                         + sizeof(struct ctdb_tcp_connection) * num;
3080
3081         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3082         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3083         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3084
3085         list->addr = *addr;
3086         list->tickles.num = num;
3087         if (num) {
3088                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3089                         sizeof(struct ctdb_tcp_connection) * num);
3090         }
3091
3092         return 0;
3093 }
3094
3095
3096 /*
3097   set the list of all tcp tickles for a public address
3098  */
3099 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3100                               struct timeval timeout, uint32_t destnode, 
3101                               ctdb_sock_addr *addr,
3102                               struct ctdb_tcp_array *tcparray)
3103 {
3104         int ret, num;
3105         TDB_DATA data;
3106         struct ctdb_control_tcp_tickle_list *list;
3107
3108         if (tcparray) {
3109                 num = tcparray->num;
3110         } else {
3111                 num = 0;
3112         }
3113
3114         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3115                                 tickles.connections) +
3116                         sizeof(struct ctdb_tcp_connection) * num;
3117         data.dptr = talloc_size(ctdb, data.dsize);
3118         CTDB_NO_MEMORY(ctdb, data.dptr);
3119
3120         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3121         list->addr = *addr;
3122         list->tickles.num = num;
3123         if (tcparray) {
3124                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3125         }
3126
3127         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3128                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3129                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3130         if (ret != 0) {
3131                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3132                 return -1;
3133         }
3134
3135         talloc_free(data.dptr);
3136
3137         return ret;
3138 }
3139
3140
3141 /*
3142   perform tickle updates if required
3143  */
3144 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3145                                 struct timed_event *te, 
3146                                 struct timeval t, void *private_data)
3147 {
3148         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3149         int ret;
3150         struct ctdb_vnn *vnn;
3151
3152         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3153                 /* we only send out updates for public addresses that 
3154                    we have taken over
3155                  */
3156                 if (ctdb->pnn != vnn->pnn) {
3157                         continue;
3158                 }
3159                 /* We only send out the updates if we need to */
3160                 if (!vnn->tcp_update_needed) {
3161                         continue;
3162                 }
3163                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3164                                 TAKEOVER_TIMEOUT(),
3165                                 CTDB_BROADCAST_CONNECTED,
3166                                 &vnn->public_address,
3167                                 vnn->tcp_array);
3168                 if (ret != 0) {
3169                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3170                                 ctdb_addr_to_str(&vnn->public_address)));
3171                 }
3172         }
3173
3174         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3175                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3176                              ctdb_update_tcp_tickles, ctdb);
3177 }               
3178         
3179
3180 /*
3181   start periodic update of tcp tickles
3182  */
3183 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3184 {
3185         ctdb->tickle_update_context = talloc_new(ctdb);
3186
3187         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3188                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3189                              ctdb_update_tcp_tickles, ctdb);
3190 }
3191
3192
3193
3194
3195 struct control_gratious_arp {
3196         struct ctdb_context *ctdb;
3197         ctdb_sock_addr addr;
3198         const char *iface;
3199         int count;
3200 };
3201
3202 /*
3203   send a control_gratuitous arp
3204  */
3205 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3206                                   struct timeval t, void *private_data)
3207 {
3208         int ret;
3209         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3210                                                         struct control_gratious_arp);
3211
3212         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3213         if (ret != 0) {
3214                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3215                                  arp->iface, strerror(errno)));
3216         }
3217
3218
3219         arp->count++;
3220         if (arp->count == CTDB_ARP_REPEAT) {
3221                 talloc_free(arp);
3222                 return;
3223         }
3224
3225         event_add_timed(arp->ctdb->ev, arp, 
3226                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3227                         send_gratious_arp, arp);
3228 }
3229
3230
3231 /*
3232   send a gratious arp 
3233  */
3234 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3235 {
3236         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3237         struct control_gratious_arp *arp;
3238
3239         /* verify the size of indata */
3240         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3241                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3242                                  (unsigned)indata.dsize, 
3243                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3244                 return -1;
3245         }
3246         if (indata.dsize != 
3247                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3248                 + gratious_arp->len ) ){
3249
3250                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3251                         "but should be %u bytes\n", 
3252                          (unsigned)indata.dsize, 
3253                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3254                 return -1;
3255         }
3256
3257
3258         arp = talloc(ctdb, struct control_gratious_arp);
3259         CTDB_NO_MEMORY(ctdb, arp);
3260
3261         arp->ctdb  = ctdb;
3262         arp->addr   = gratious_arp->addr;
3263         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3264         CTDB_NO_MEMORY(ctdb, arp->iface);
3265         arp->count = 0;
3266         
3267         event_add_timed(arp->ctdb->ev, arp, 
3268                         timeval_zero(), send_gratious_arp, arp);
3269
3270         return 0;
3271 }
3272
3273 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3274 {
3275         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3276         int ret;
3277
3278         /* verify the size of indata */
3279         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3280                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3281                 return -1;
3282         }
3283         if (indata.dsize != 
3284                 ( offsetof(struct ctdb_control_ip_iface, iface)
3285                 + pub->len ) ){
3286
3287                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3288                         "but should be %u bytes\n", 
3289                          (unsigned)indata.dsize, 
3290                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3291                 return -1;
3292         }
3293
3294         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
3295
3296         if (ret != 0) {
3297                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3298                 return -1;
3299         }
3300
3301         return 0;
3302 }
3303
/*
  Completion callback for the releaseip event started by
  ctdb_control_del_public_address().

  private_data is the talloc context handed to the event; freeing it is
  all that is left to do.  ctdb and status are ignored.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status,
                                void *private_data)
{
        void *release_ctx = private_data;

        talloc_free(release_ctx);
}
3312
3313 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3314 {
3315         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3316         struct ctdb_vnn *vnn;
3317         int ret;
3318
3319         /* verify the size of indata */
3320         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3321                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3322                 return -1;
3323         }
3324         if (indata.dsize != 
3325                 ( offsetof(struct ctdb_control_ip_iface, iface)
3326                 + pub->len ) ){
3327
3328                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3329                         "but should be %u bytes\n", 
3330                          (unsigned)indata.dsize, 
3331                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3332                 return -1;
3333         }
3334
3335         /* walk over all public addresses until we find a match */
3336         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3337                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3338                         TALLOC_CTX *mem_ctx;
3339
3340                         DLIST_REMOVE(ctdb->vnn, vnn);
3341                         if (vnn->pnn != ctdb->pnn) {
3342                                 if (vnn->iface != NULL) {
3343                                         ctdb_vnn_unassign_iface(ctdb, vnn);
3344                                 }
3345                                 talloc_free(vnn);
3346                                 return 0;
3347                         }
3348                         vnn->pnn = -1;
3349
3350                         mem_ctx = talloc_new(ctdb);
3351                         talloc_steal(mem_ctx, vnn);
3352                         ret = ctdb_event_script_callback(ctdb, 
3353                                          mem_ctx, delete_ip_callback, mem_ctx,
3354                                          false,
3355                                          CTDB_EVENT_RELEASE_IP,
3356                                          "%s %s %u",
3357                                          ctdb_vnn_iface_string(vnn),
3358                                          ctdb_addr_to_str(&vnn->public_address),
3359                                          vnn->public_netmask_bits);
3360                         if (vnn->iface != NULL) {
3361                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3362                         }
3363                         if (ret != 0) {
3364                                 return -1;
3365                         }
3366                         return 0;
3367                 }
3368         }
3369
3370         return -1;
3371 }
3372
3373 /* This function is called from the recovery daemon to verify that a remote
3374    node has the expected ip allocation.
3375    This is verified against ctdb->ip_tree
3376 */
3377 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3378 {
3379         struct ctdb_public_ip_list *tmp_ip; 
3380         int i;
3381
3382         if (ctdb->ip_tree == NULL) {
3383                 /* dont know the expected allocation yet, assume remote node
3384                    is correct. */
3385                 return 0;
3386         }
3387
3388         if (ips == NULL) {
3389                 return 0;
3390         }
3391
3392         for (i=0; i<ips->num; i++) {
3393                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3394                 if (tmp_ip == NULL) {
3395                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3396                         return -1;
3397                 }
3398
3399                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3400                         continue;
3401                 }
3402
3403                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3404                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3405                         return -1;
3406                 }
3407         }
3408
3409         return 0;
3410 }
3411
3412 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3413 {
3414         struct ctdb_public_ip_list *tmp_ip; 
3415
3416         if (ctdb->ip_tree == NULL) {
3417                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3418                 return -1;
3419         }
3420
3421         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3422         if (tmp_ip == NULL) {
3423                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3424                 return -1;
3425         }
3426
3427         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3428         tmp_ip->pnn = ip->pnn;
3429
3430         return 0;
3431 }