ReadOnly: Change the ctdb_db structure to keep a uint8_t for flags instead of a boole...
[sahlberg/ctdb.git] / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "includes.h"
22 #include "lib/tevent/tevent.h"
23 #include "lib/tdb/include/tdb.h"
24 #include "lib/util/dlinklist.h"
25 #include "system/network.h"
26 #include "system/filesys.h"
27 #include "system/wait.h"
28 #include "../include/ctdb_private.h"
29 #include "../common/rb_tree.h"
30
31
/*
  timeout for takeover operations: timeval_current_ofs() applied to the
  takeover_timeout tunable. Expects a variable named 'ctdb' in scope.
 */
#define TAKEOVER_TIMEOUT() \
        timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* first argument handed to timeval_current_ofs() between announcement rounds */
#define CTDB_ARP_INTERVAL 1
/* number of announcement rounds after which the arp state is freed */
#define CTDB_ARP_REPEAT   3
36
/*
  a network interface known to this node
 */
struct ctdb_iface {
        /* neighbours in the ctdb->ifaces list */
        struct ctdb_iface *prev;
        struct ctdb_iface *next;
        /* interface name, compared with strcmp() when searching the list */
        const char *name;
        /* only interfaces with link_up set are chosen for public addresses */
        bool link_up;
        /* number of public addresses currently assigned to this interface */
        uint32_t references;
};
43
44 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
45 {
46         if (vnn->iface) {
47                 return vnn->iface->name;
48         }
49
50         return "__none__";
51 }
52
53 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
54 {
55         struct ctdb_iface *i;
56
57         /* Verify that we dont have an entry for this ip yet */
58         for (i=ctdb->ifaces;i;i=i->next) {
59                 if (strcmp(i->name, iface) == 0) {
60                         return 0;
61                 }
62         }
63
64         /* create a new structure for this interface */
65         i = talloc_zero(ctdb, struct ctdb_iface);
66         CTDB_NO_MEMORY_FATAL(ctdb, i);
67         i->name = talloc_strdup(i, iface);
68         CTDB_NO_MEMORY(ctdb, i->name);
69         i->link_up = false;
70
71         DLIST_ADD(ctdb->ifaces, i);
72
73         return 0;
74 }
75
76 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
77                                           const char *iface)
78 {
79         struct ctdb_iface *i;
80
81         /* Verify that we dont have an entry for this ip yet */
82         for (i=ctdb->ifaces;i;i=i->next) {
83                 if (strcmp(i->name, iface) == 0) {
84                         return i;
85                 }
86         }
87
88         return NULL;
89 }
90
91 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
92                                               struct ctdb_vnn *vnn)
93 {
94         int i;
95         struct ctdb_iface *cur = NULL;
96         struct ctdb_iface *best = NULL;
97
98         for (i=0; vnn->ifaces[i]; i++) {
99
100                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
101                 if (cur == NULL) {
102                         continue;
103                 }
104
105                 if (!cur->link_up) {
106                         continue;
107                 }
108
109                 if (best == NULL) {
110                         best = cur;
111                         continue;
112                 }
113
114                 if (cur->references < best->references) {
115                         best = cur;
116                         continue;
117                 }
118         }
119
120         return best;
121 }
122
123 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
124                                      struct ctdb_vnn *vnn)
125 {
126         struct ctdb_iface *best = NULL;
127
128         if (vnn->iface) {
129                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
130                                    "still assigned to iface '%s'\n",
131                                    ctdb_addr_to_str(&vnn->public_address),
132                                    ctdb_vnn_iface_string(vnn)));
133                 return 0;
134         }
135
136         best = ctdb_vnn_best_iface(ctdb, vnn);
137         if (best == NULL) {
138                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
139                                   "cannot assign to iface any iface\n",
140                                   ctdb_addr_to_str(&vnn->public_address)));
141                 return -1;
142         }
143
144         vnn->iface = best;
145         best->references++;
146         vnn->pnn = ctdb->pnn;
147
148         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
149                            "now assigned to iface '%s' refs[%d]\n",
150                            ctdb_addr_to_str(&vnn->public_address),
151                            ctdb_vnn_iface_string(vnn),
152                            best->references));
153         return 0;
154 }
155
156 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
157                                     struct ctdb_vnn *vnn)
158 {
159         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
160                            "now unassigned (old iface '%s' refs[%d])\n",
161                            ctdb_addr_to_str(&vnn->public_address),
162                            ctdb_vnn_iface_string(vnn),
163                            vnn->iface?vnn->iface->references:0));
164         if (vnn->iface) {
165                 vnn->iface->references--;
166         }
167         vnn->iface = NULL;
168         if (vnn->pnn == ctdb->pnn) {
169                 vnn->pnn = -1;
170         }
171 }
172
173 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
174                                struct ctdb_vnn *vnn)
175 {
176         int i;
177
178         if (vnn->iface && vnn->iface->link_up) {
179                 return true;
180         }
181
182         for (i=0; vnn->ifaces[i]; i++) {
183                 struct ctdb_iface *cur;
184
185                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186                 if (cur == NULL) {
187                         continue;
188                 }
189
190                 if (cur->link_up) {
191                         return true;
192                 }
193         }
194
195         return false;
196 }
197
198 struct ctdb_takeover_arp {
199         struct ctdb_context *ctdb;
200         uint32_t count;
201         ctdb_sock_addr addr;
202         struct ctdb_tcp_array *tcparray;
203         struct ctdb_vnn *vnn;
204 };
205
206
207 /*
208   lists of tcp endpoints
209  */
210 struct ctdb_tcp_list {
211         struct ctdb_tcp_list *prev, *next;
212         struct ctdb_tcp_connection connection;
213 };
214
215 /*
216   list of clients to kill on IP release
217  */
218 struct ctdb_client_ip {
219         struct ctdb_client_ip *prev, *next;
220         struct ctdb_context *ctdb;
221         ctdb_sock_addr addr;
222         uint32_t client_id;
223 };
224
225
226 /*
227   send a gratuitous arp
228  */
229 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
230                                   struct timeval t, void *private_data)
231 {
232         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
233                                                         struct ctdb_takeover_arp);
234         int i, ret;
235         struct ctdb_tcp_array *tcparray;
236         const char *iface = ctdb_vnn_iface_string(arp->vnn);
237
238         ret = ctdb_sys_send_arp(&arp->addr, iface);
239         if (ret != 0) {
240                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
241                                   iface, strerror(errno)));
242         }
243
244         tcparray = arp->tcparray;
245         if (tcparray) {
246                 for (i=0;i<tcparray->num;i++) {
247                         struct ctdb_tcp_connection *tcon;
248
249                         tcon = &tcparray->connections[i];
250                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
251                                 (unsigned)ntohs(tcon->dst_addr.ip.sin_port), 
252                                 ctdb_addr_to_str(&tcon->src_addr),
253                                 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
254                         ret = ctdb_sys_send_tcp(
255                                 &tcon->src_addr, 
256                                 &tcon->dst_addr,
257                                 0, 0, 0);
258                         if (ret != 0) {
259                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
260                                         ctdb_addr_to_str(&tcon->src_addr)));
261                         }
262                 }
263         }
264
265         arp->count++;
266
267         if (arp->count == CTDB_ARP_REPEAT) {
268                 talloc_free(arp);
269                 return;
270         }
271
272         event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
273                         timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
274                         ctdb_control_send_arp, arp);
275 }
276
277 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
278                                        struct ctdb_vnn *vnn)
279 {
280         struct ctdb_takeover_arp *arp;
281         struct ctdb_tcp_array *tcparray;
282
283         if (!vnn->takeover_ctx) {
284                 vnn->takeover_ctx = talloc_new(vnn);
285                 if (!vnn->takeover_ctx) {
286                         return -1;
287                 }
288         }
289
290         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
291         if (!arp) {
292                 return -1;
293         }
294
295         arp->ctdb = ctdb;
296         arp->addr = vnn->public_address;
297         arp->vnn  = vnn;
298
299         tcparray = vnn->tcp_array;
300         if (tcparray) {
301                 /* add all of the known tcp connections for this IP to the
302                    list of tcp connections to send tickle acks for */
303                 arp->tcparray = talloc_steal(arp, tcparray);
304
305                 vnn->tcp_array = NULL;
306                 vnn->tcp_update_needed = true;
307         }
308
309         event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
310                         timeval_zero(), ctdb_control_send_arp, arp);
311
312         return 0;
313 }
314
315 struct takeover_callback_state {
316         struct ctdb_req_control *c;
317         ctdb_sock_addr *addr;
318         struct ctdb_vnn *vnn;
319 };
320
/*
  state carried from ctdb_do_takeip() to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
        /* the control that is answered from the callback */
        struct ctdb_req_control *c;
        /* the public address that is being taken over */
        struct ctdb_vnn         *vnn;
};
325
326 /*
327   called when takeip event finishes
328  */
329 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
330                                     void *private_data)
331 {
332         struct ctdb_do_takeip_state *state =
333                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
334         int32_t ret;
335         TDB_DATA data;
336
337         if (status != 0) {
338                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
339         
340                 if (status == -ETIME) {
341                         ctdb_ban_self(ctdb);
342                 }
343                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
344                                  ctdb_addr_to_str(&state->vnn->public_address),
345                                  ctdb_vnn_iface_string(state->vnn)));
346                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
347
348                 node->flags |= NODE_FLAGS_UNHEALTHY;
349                 talloc_free(state);
350                 return;
351         }
352
353         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
354         if (ret != 0) {
355                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
356                 talloc_free(state);
357                 return;
358         }
359
360         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
361         data.dsize = strlen((char *)data.dptr) + 1;
362         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
363
364         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
365
366
367         /* the control succeeded */
368         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
369         talloc_free(state);
370         return;
371 }
372
373 /*
374   take over an ip address
375  */
376 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
377                               struct ctdb_req_control *c,
378                               struct ctdb_vnn *vnn)
379 {
380         int ret;
381         struct ctdb_do_takeip_state *state;
382
383         ret = ctdb_vnn_assign_iface(ctdb, vnn);
384         if (ret != 0) {
385                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
386                                  "assin a usable interface\n",
387                                  ctdb_addr_to_str(&vnn->public_address),
388                                  vnn->public_netmask_bits));
389                 return -1;
390         }
391
392         state = talloc(vnn, struct ctdb_do_takeip_state);
393         CTDB_NO_MEMORY(ctdb, state);
394
395         state->c = talloc_steal(ctdb, c);
396         state->vnn   = vnn;
397
398         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
399                             ctdb_addr_to_str(&vnn->public_address),
400                             vnn->public_netmask_bits,
401                             ctdb_vnn_iface_string(vnn)));
402
403         ret = ctdb_event_script_callback(ctdb,
404                                          state,
405                                          ctdb_do_takeip_callback,
406                                          state,
407                                          false,
408                                          CTDB_EVENT_TAKE_IP,
409                                          "%s %s %u",
410                                          ctdb_vnn_iface_string(vnn),
411                                          ctdb_addr_to_str(&vnn->public_address),
412                                          vnn->public_netmask_bits);
413
414         if (ret != 0) {
415                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
416                         ctdb_addr_to_str(&vnn->public_address),
417                         ctdb_vnn_iface_string(vnn)));
418                 talloc_free(state);
419                 return -1;
420         }
421
422         return 0;
423 }
424
/*
  state carried from ctdb_do_updateip() to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
        /* the control that is answered from the callback */
        struct ctdb_req_control *c;
        /* the interface the address was on before the move */
        struct ctdb_iface       *old;
        /* the public address that is being moved */
        struct ctdb_vnn         *vnn;
};
430
431 /*
432   called when updateip event finishes
433  */
434 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
435                                       void *private_data)
436 {
437         struct ctdb_do_updateip_state *state =
438                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
439         int32_t ret;
440
441         if (status != 0) {
442                 if (status == -ETIME) {
443                         ctdb_ban_self(ctdb);
444                 }
445                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
446                         ctdb_addr_to_str(&state->vnn->public_address),
447                         state->old->name,
448                         ctdb_vnn_iface_string(state->vnn)));
449
450                 /*
451                  * All we can do is reset the old interface
452                  * and let the next run fix it
453                  */
454                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
455                 state->vnn->iface = state->old;
456                 state->vnn->iface->references++;
457
458                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
459                 talloc_free(state);
460                 return;
461         }
462
463         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
464         if (ret != 0) {
465                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
466                 talloc_free(state);
467                 return;
468         }
469
470         /* the control succeeded */
471         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
472         talloc_free(state);
473         return;
474 }
475
476 /*
477   update (move) an ip address
478  */
479 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
480                                 struct ctdb_req_control *c,
481                                 struct ctdb_vnn *vnn)
482 {
483         int ret;
484         struct ctdb_do_updateip_state *state;
485         struct ctdb_iface *old = vnn->iface;
486         const char *new_name;
487
488         ctdb_vnn_unassign_iface(ctdb, vnn);
489         ret = ctdb_vnn_assign_iface(ctdb, vnn);
490         if (ret != 0) {
491                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
492                                  "assin a usable interface (old iface '%s')\n",
493                                  ctdb_addr_to_str(&vnn->public_address),
494                                  vnn->public_netmask_bits,
495                                  old->name));
496                 return -1;
497         }
498
499         new_name = ctdb_vnn_iface_string(vnn);
500         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
501                 /* A benign update from one interface onto itself.
502                  * no need to run the eventscripts in this case, just return
503                  * success.
504                  */
505                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
506                 return 0;
507         }
508
509         state = talloc(vnn, struct ctdb_do_updateip_state);
510         CTDB_NO_MEMORY(ctdb, state);
511
512         state->c = talloc_steal(ctdb, c);
513         state->old = old;
514         state->vnn = vnn;
515
516         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
517                             "interface %s to %s\n",
518                             ctdb_addr_to_str(&vnn->public_address),
519                             vnn->public_netmask_bits,
520                             old->name,
521                             new_name));
522
523         ret = ctdb_event_script_callback(ctdb,
524                                          state,
525                                          ctdb_do_updateip_callback,
526                                          state,
527                                          false,
528                                          CTDB_EVENT_UPDATE_IP,
529                                          "%s %s %s %u",
530                                          state->old->name,
531                                          new_name,
532                                          ctdb_addr_to_str(&vnn->public_address),
533                                          vnn->public_netmask_bits);
534         if (ret != 0) {
535                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
536                                  ctdb_addr_to_str(&vnn->public_address),
537                                  old->name, new_name));
538                 talloc_free(state);
539                 return -1;
540         }
541
542         return 0;
543 }
544
545 /*
546   Find the vnn of the node that has a public ip address
547   returns -1 if the address is not known as a public address
548  */
549 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
550 {
551         struct ctdb_vnn *vnn;
552
553         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
554                 if (ctdb_same_ip(&vnn->public_address, addr)) {
555                         return vnn;
556                 }
557         }
558
559         return NULL;
560 }
561
562 /*
563   take over an ip address
564  */
565 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
566                                  struct ctdb_req_control *c,
567                                  TDB_DATA indata,
568                                  bool *async_reply)
569 {
570         int ret;
571         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
572         struct ctdb_vnn *vnn;
573         bool have_ip = false;
574         bool do_updateip = false;
575         bool do_takeip = false;
576         struct ctdb_iface *best_iface = NULL;
577
578         if (pip->pnn != ctdb->pnn) {
579                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
580                                  "with pnn %d, but we're node %d\n",
581                                  ctdb_addr_to_str(&pip->addr),
582                                  pip->pnn, ctdb->pnn));
583                 return -1;
584         }
585
586         /* update out vnn list */
587         vnn = find_public_ip_vnn(ctdb, &pip->addr);
588         if (vnn == NULL) {
589                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
590                         ctdb_addr_to_str(&pip->addr)));
591                 return 0;
592         }
593
594         have_ip = ctdb_sys_have_ip(&pip->addr);
595         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
596         if (best_iface == NULL) {
597                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
598                                  "a usable interface (old %s, have_ip %d)\n",
599                                  ctdb_addr_to_str(&vnn->public_address),
600                                  vnn->public_netmask_bits,
601                                  ctdb_vnn_iface_string(vnn),
602                                  have_ip));
603                 return -1;
604         }
605
606         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
607                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
608                 have_ip = false;
609         }
610
611         if (vnn->iface == NULL && have_ip) {
612                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
613                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
614                                  ctdb_addr_to_str(&vnn->public_address)));
615                 return 0;
616         }
617
618         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
619                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
620                                   "and we have it on iface[%s], but it was assigned to node %d"
621                                   "and we are node %d, banning ourself\n",
622                                  ctdb_addr_to_str(&vnn->public_address),
623                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
624                 ctdb_ban_self(ctdb);
625                 return -1;
626         }
627
628         if (vnn->pnn == -1 && have_ip) {
629                 vnn->pnn = ctdb->pnn;
630                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
631                                   "and we already have it on iface[%s], update local daemon\n",
632                                  ctdb_addr_to_str(&vnn->public_address),
633                                   ctdb_vnn_iface_string(vnn)));
634                 return 0;
635         }
636
637         if (vnn->iface) {
638                 if (vnn->iface->link_up) {
639                         /* only move when the rebalance gains something */
640                         if (vnn->iface->references > (best_iface->references + 1)) {
641                                 do_updateip = true;
642                         }
643                 } else if (vnn->iface != best_iface) {
644                         do_updateip = true;
645                 }
646         }
647
648         if (!have_ip) {
649                 if (do_updateip) {
650                         ctdb_vnn_unassign_iface(ctdb, vnn);
651                         do_updateip = false;
652                 }
653                 do_takeip = true;
654         }
655
656         if (do_takeip) {
657                 ret = ctdb_do_takeip(ctdb, c, vnn);
658                 if (ret != 0) {
659                         return -1;
660                 }
661         } else if (do_updateip) {
662                 ret = ctdb_do_updateip(ctdb, c, vnn);
663                 if (ret != 0) {
664                         return -1;
665                 }
666         } else {
667                 /*
668                  * The interface is up and the kernel known the ip
669                  * => do nothing
670                  */
671                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
672                         ctdb_addr_to_str(&pip->addr),
673                         vnn->public_netmask_bits,
674                         ctdb_vnn_iface_string(vnn)));
675                 return 0;
676         }
677
678         /* tell ctdb_control.c that we will be replying asynchronously */
679         *async_reply = true;
680
681         return 0;
682 }
683
684 /*
685   takeover an ip address old v4 style
686  */
687 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
688                                 struct ctdb_req_control *c,
689                                 TDB_DATA indata, 
690                                 bool *async_reply)
691 {
692         TDB_DATA data;
693         
694         data.dsize = sizeof(struct ctdb_public_ip);
695         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
696         CTDB_NO_MEMORY(ctdb, data.dptr);
697         
698         memcpy(data.dptr, indata.dptr, indata.dsize);
699         return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
700 }
701
702 /*
703   kill any clients that are registered with a IP that is being released
704  */
705 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
706 {
707         struct ctdb_client_ip *ip;
708
709         DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
710                 ctdb_addr_to_str(addr)));
711
712         for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
713                 ctdb_sock_addr tmp_addr;
714
715                 tmp_addr = ip->addr;
716                 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n", 
717                         ip->client_id,
718                         ctdb_addr_to_str(&ip->addr)));
719
720                 if (ctdb_same_ip(&tmp_addr, addr)) {
721                         struct ctdb_client *client = ctdb_reqid_find(ctdb, 
722                                                                      ip->client_id, 
723                                                                      struct ctdb_client);
724                         DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n", 
725                                 ip->client_id,
726                                 ctdb_addr_to_str(&ip->addr),
727                                 client->pid));
728
729                         if (client->pid != 0) {
730                                 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
731                                         (unsigned)client->pid,
732                                         ctdb_addr_to_str(addr),
733                                         ip->client_id));
734                                 kill(client->pid, SIGKILL);
735                         }
736                 }
737         }
738 }
739
740 /*
741   called when releaseip event finishes
742  */
743 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
744                                 void *private_data)
745 {
746         struct takeover_callback_state *state = 
747                 talloc_get_type(private_data, struct takeover_callback_state);
748         TDB_DATA data;
749
750         if (status == -ETIME) {
751                 ctdb_ban_self(ctdb);
752         }
753
754         /* send a message to all clients of this node telling them
755            that the cluster has been reconfigured and they should
756            release any sockets on this IP */
757         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
758         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
759         data.dsize = strlen((char *)data.dptr)+1;
760
761         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
762
763         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
764
765         /* kill clients that have registered with this IP */
766         release_kill_clients(ctdb, state->addr);
767
768         ctdb_vnn_unassign_iface(ctdb, state->vnn);
769
770         /* the control succeeded */
771         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
772         talloc_free(state);
773 }
774
775 /*
776   release an ip address
777  */
778 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
779                                 struct ctdb_req_control *c,
780                                 TDB_DATA indata, 
781                                 bool *async_reply)
782 {
783         int ret;
784         struct takeover_callback_state *state;
785         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
786         struct ctdb_vnn *vnn;
787
788         /* update our vnn list */
789         vnn = find_public_ip_vnn(ctdb, &pip->addr);
790         if (vnn == NULL) {
791                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
792                         ctdb_addr_to_str(&pip->addr)));
793                 return 0;
794         }
795         vnn->pnn = pip->pnn;
796
797         /* stop any previous arps */
798         talloc_free(vnn->takeover_ctx);
799         vnn->takeover_ctx = NULL;
800
801         if (!ctdb_sys_have_ip(&pip->addr)) {
802                 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n", 
803                         ctdb_addr_to_str(&pip->addr),
804                         vnn->public_netmask_bits, 
805                         ctdb_vnn_iface_string(vnn)));
806                 ctdb_vnn_unassign_iface(ctdb, vnn);
807                 return 0;
808         }
809
810         if (vnn->iface == NULL) {
811                 DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
812                                  "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
813                                  ctdb_addr_to_str(&vnn->public_address)));
814                 return 0;
815         }
816
817         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
818                 ctdb_addr_to_str(&pip->addr),
819                 vnn->public_netmask_bits, 
820                 ctdb_vnn_iface_string(vnn),
821                 pip->pnn));
822
823         state = talloc(ctdb, struct takeover_callback_state);
824         CTDB_NO_MEMORY(ctdb, state);
825
826         state->c = talloc_steal(state, c);
827         state->addr = talloc(state, ctdb_sock_addr);       
828         CTDB_NO_MEMORY(ctdb, state->addr);
829         *state->addr = pip->addr;
830         state->vnn   = vnn;
831
832         ret = ctdb_event_script_callback(ctdb, 
833                                          state, release_ip_callback, state,
834                                          false,
835                                          CTDB_EVENT_RELEASE_IP,
836                                          "%s %s %u",
837                                          ctdb_vnn_iface_string(vnn),
838                                          ctdb_addr_to_str(&pip->addr),
839                                          vnn->public_netmask_bits);
840         if (ret != 0) {
841                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
842                         ctdb_addr_to_str(&pip->addr),
843                         ctdb_vnn_iface_string(vnn)));
844                 talloc_free(state);
845                 return -1;
846         }
847
848         /* tell the control that we will be reply asynchronously */
849         *async_reply = true;
850         return 0;
851 }
852
853 /*
854   release an ip address old v4 style
855  */
856 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
857                                 struct ctdb_req_control *c,
858                                 TDB_DATA indata, 
859                                 bool *async_reply)
860 {
861         TDB_DATA data;
862         
863         data.dsize = sizeof(struct ctdb_public_ip);
864         data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
865         CTDB_NO_MEMORY(ctdb, data.dptr);
866         
867         memcpy(data.dptr, indata.dptr, indata.dsize);
868         return ctdb_control_release_ip(ctdb, c, data, async_reply);
869 }
870
871
872 static int ctdb_add_public_address(struct ctdb_context *ctdb,
873                                    ctdb_sock_addr *addr,
874                                    unsigned mask, const char *ifaces)
875 {
876         struct ctdb_vnn      *vnn;
877         uint32_t num = 0;
878         char *tmp;
879         const char *iface;
880         int i;
881         int ret;
882
883         /* Verify that we dont have an entry for this ip yet */
884         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
885                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
886                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
887                                 ctdb_addr_to_str(addr)));
888                         return -1;
889                 }               
890         }
891
892         /* create a new vnn structure for this ip address */
893         vnn = talloc_zero(ctdb, struct ctdb_vnn);
894         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
895         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
896         tmp = talloc_strdup(vnn, ifaces);
897         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
898         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
899                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
900                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
901                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
902                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
903                 num++;
904         }
905         talloc_free(tmp);
906         vnn->ifaces[num] = NULL;
907         vnn->public_address      = *addr;
908         vnn->public_netmask_bits = mask;
909         vnn->pnn                 = -1;
910         if (ctdb_sys_have_ip(addr)) {
911                 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
912                 vnn->pnn = ctdb->pnn;
913         }
914
915         for (i=0; vnn->ifaces[i]; i++) {
916                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
917                 if (ret != 0) {
918                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
919                                            "for public_address[%s]\n",
920                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
921                         talloc_free(vnn);
922                         return -1;
923                 }
924                 if (i == 0) {
925                         vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
926                 }
927         }
928
929         DLIST_ADD(ctdb->vnn, vnn);
930
931         return 0;
932 }
933
934 /*
935   setup the event script directory
936 */
937 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
938 {
939         ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
940         CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
941         return 0;
942 }
943
944 /*
945   setup the public address lists from a file
946 */
947 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
948 {
949         char **lines;
950         int nlines;
951         int i;
952
953         lines = file_lines_load(alist, &nlines, ctdb);
954         if (lines == NULL) {
955                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
956                 return -1;
957         }
958         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
959                 nlines--;
960         }
961
962         for (i=0;i<nlines;i++) {
963                 unsigned mask;
964                 ctdb_sock_addr addr;
965                 const char *addrstr;
966                 const char *ifaces;
967                 char *tok, *line;
968
969                 line = lines[i];
970                 while ((*line == ' ') || (*line == '\t')) {
971                         line++;
972                 }
973                 if (*line == '#') {
974                         continue;
975                 }
976                 if (strcmp(line, "") == 0) {
977                         continue;
978                 }
979                 tok = strtok(line, " \t");
980                 addrstr = tok;
981                 tok = strtok(NULL, " \t");
982                 if (tok == NULL) {
983                         if (NULL == ctdb->default_public_interface) {
984                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
985                                          i+1));
986                                 talloc_free(lines);
987                                 return -1;
988                         }
989                         ifaces = ctdb->default_public_interface;
990                 } else {
991                         ifaces = tok;
992                 }
993
994                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
995                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
996                         talloc_free(lines);
997                         return -1;
998                 }
999                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
1000                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1001                         talloc_free(lines);
1002                         return -1;
1003                 }
1004         }
1005
1006         talloc_free(lines);
1007         return 0;
1008 }
1009
1010 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1011                               const char *iface,
1012                               const char *ip)
1013 {
1014         struct ctdb_vnn *svnn;
1015         struct ctdb_iface *cur = NULL;
1016         bool ok;
1017         int ret;
1018
1019         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1020         CTDB_NO_MEMORY(ctdb, svnn);
1021
1022         svnn->ifaces = talloc_array(svnn, const char *, 2);
1023         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1024         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1025         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1026         svnn->ifaces[1] = NULL;
1027
1028         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1029         if (!ok) {
1030                 talloc_free(svnn);
1031                 return -1;
1032         }
1033
1034         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1035         if (ret != 0) {
1036                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1037                                    "for single_ip[%s]\n",
1038                                    svnn->ifaces[0],
1039                                    ctdb_addr_to_str(&svnn->public_address)));
1040                 talloc_free(svnn);
1041                 return -1;
1042         }
1043
1044         /* assume the single public ip interface is initially "good" */
1045         cur = ctdb_find_iface(ctdb, iface);
1046         if (cur == NULL) {
1047                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1048                 return -1;
1049         }
1050         cur->link_up = true;
1051
1052         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1053         if (ret != 0) {
1054                 talloc_free(svnn);
1055                 return -1;
1056         }
1057
1058         ctdb->single_ip_vnn = svnn;
1059         return 0;
1060 }
1061
1062 /* Given a physical node, return the number of
1063    public addresses that is currently assigned to this node.
1064 */
1065 static int node_ip_coverage(struct ctdb_context *ctdb, 
1066         int32_t pnn,
1067         struct ctdb_public_ip_list *ips)
1068 {
1069         int num=0;
1070
1071         for (;ips;ips=ips->next) {
1072                 if (ips->pnn == pnn) {
1073                         num++;
1074                 }
1075         }
1076         return num;
1077 }
1078
1079
1080 /* Check if this is a public ip known to the node, i.e. can that
1081    node takeover this ip ?
1082 */
1083 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn, 
1084                 struct ctdb_public_ip_list *ip)
1085 {
1086         struct ctdb_all_public_ips *public_ips;
1087         int i;
1088
1089         public_ips = ctdb->nodes[pnn]->available_public_ips;
1090
1091         if (public_ips == NULL) {
1092                 return -1;
1093         }
1094
1095         for (i=0;i<public_ips->num;i++) {
1096                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1097                         /* yes, this node can serve this public ip */
1098                         return 0;
1099                 }
1100         }
1101
1102         return -1;
1103 }
1104
1105
1106 /* search the node lists list for a node to takeover this ip.
1107    pick the node that currently are serving the least number of ips
1108    so that the ips get spread out evenly.
1109 */
1110 static int find_takeover_node(struct ctdb_context *ctdb, 
1111                 struct ctdb_node_map *nodemap, uint32_t mask, 
1112                 struct ctdb_public_ip_list *ip,
1113                 struct ctdb_public_ip_list *all_ips)
1114 {
1115         int pnn, min=0, num;
1116         int i;
1117
1118         pnn    = -1;
1119         for (i=0;i<nodemap->num;i++) {
1120                 if (nodemap->nodes[i].flags & mask) {
1121                         /* This node is not healty and can not be used to serve
1122                            a public address 
1123                         */
1124                         continue;
1125                 }
1126
1127                 /* verify that this node can serve this ip */
1128                 if (can_node_serve_ip(ctdb, i, ip)) {
1129                         /* no it couldnt   so skip to the next node */
1130                         continue;
1131                 }
1132
1133                 num = node_ip_coverage(ctdb, i, all_ips);
1134                 /* was this the first node we checked ? */
1135                 if (pnn == -1) {
1136                         pnn = i;
1137                         min  = num;
1138                 } else {
1139                         if (num < min) {
1140                                 pnn = i;
1141                                 min  = num;
1142                         }
1143                 }
1144         }       
1145         if (pnn == -1) {
1146                 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1147                         ctdb_addr_to_str(&ip->addr)));
1148
1149                 return -1;
1150         }
1151
1152         ip->pnn = pnn;
1153         return 0;
1154 }
1155
1156 #define IP_KEYLEN       4
1157 static uint32_t *ip_key(ctdb_sock_addr *ip)
1158 {
1159         static uint32_t key[IP_KEYLEN];
1160
1161         bzero(key, sizeof(key));
1162
1163         switch (ip->sa.sa_family) {
1164         case AF_INET:
1165                 key[3]  = htonl(ip->ip.sin_addr.s_addr);
1166                 break;
1167         case AF_INET6:
1168                 key[0]  = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
1169                 key[1]  = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
1170                 key[2]  = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
1171                 key[3]  = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
1172                 break;
1173         default:
1174                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1175                 return key;
1176         }
1177
1178         return key;
1179 }
1180
1181 static void *add_ip_callback(void *parm, void *data)
1182 {
1183         struct ctdb_public_ip_list *this_ip = parm; 
1184         struct ctdb_public_ip_list *prev_ip = data; 
1185
1186         if (prev_ip == NULL) {
1187                 return parm;
1188         }
1189         if (this_ip->pnn == -1) {
1190                 this_ip->pnn = prev_ip->pnn;
1191         }
1192
1193         return parm;
1194 }
1195
1196 void getips_count_callback(void *param, void *data)
1197 {
1198         struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1199         struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1200
1201         new_ip->next = *ip_list;
1202         *ip_list     = new_ip;
1203 }
1204
1205 static struct ctdb_public_ip_list *
1206 create_merged_ip_list(struct ctdb_context *ctdb)
1207 {
1208         int i, j;
1209         struct ctdb_public_ip_list *ip_list;
1210         struct ctdb_all_public_ips *public_ips;
1211
1212         if (ctdb->ip_tree != NULL) {
1213                 talloc_free(ctdb->ip_tree);
1214                 ctdb->ip_tree = NULL;
1215         }
1216         ctdb->ip_tree = trbt_create(ctdb, 0);
1217
1218         for (i=0;i<ctdb->num_nodes;i++) {
1219                 public_ips = ctdb->nodes[i]->known_public_ips;
1220
1221                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1222                         continue;
1223                 }
1224
1225                 /* there were no public ips for this node */
1226                 if (public_ips == NULL) {
1227                         continue;
1228                 }               
1229
1230                 for (j=0;j<public_ips->num;j++) {
1231                         struct ctdb_public_ip_list *tmp_ip; 
1232
1233                         tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1234                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1235                         tmp_ip->pnn  = public_ips->ips[j].pnn;
1236                         tmp_ip->addr = public_ips->ips[j].addr;
1237                         tmp_ip->next = NULL;
1238
1239                         trbt_insertarray32_callback(ctdb->ip_tree,
1240                                 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1241                                 add_ip_callback,
1242                                 tmp_ip);
1243                 }
1244         }
1245
1246         ip_list = NULL;
1247         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1248
1249         return ip_list;
1250 }
1251
1252 /* 
1253  * This is the length of the longtest common prefix between the IPs.
1254  * It is calculated by XOR-ing the 2 IPs together and counting the
1255  * number of leading zeroes.  The implementation means that all
1256  * addresses end up being 128 bits long.
1257  * Not static, so we can easily link it into a unit test.
1258  *
1259  * FIXME? Should we consider IPv4 and IPv6 separately given that the
1260  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1261  * lots of nodes and IP addresses?
1262  */
1263 uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1264 {
1265         uint32_t ip1_k[IP_KEYLEN];
1266         uint32_t *t;
1267         int i;
1268         uint32_t x;
1269
1270         uint32_t distance = 0;
1271
1272         memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1273         t = ip_key(ip2);
1274         for (i=0; i<IP_KEYLEN; i++) {
1275                 x = ip1_k[i] ^ t[i];
1276                 if (x == 0) {
1277                         distance += 32;
1278                 } else {
1279                         /* Count number of leading zeroes. 
1280                          * FIXME? This could be optimised...
1281                          */
1282                         while ((x & (1 << 31)) == 0) {
1283                                 x <<= 1;
1284                                 distance += 1;
1285                         }
1286                 }
1287         }
1288
1289         return distance;
1290 }
1291
1292 /* Calculate the IP distance for the given IP relative to IPs on the
1293    given node.  The ips argument is generally the all_ips variable
1294    used in the main part of the algorithm.
1295  * Not static, so we can easily link it into a unit test.
1296  */
1297 uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1298                            struct ctdb_public_ip_list *ips,
1299                            int pnn)
1300 {
1301         struct ctdb_public_ip_list *t;
1302         uint32_t d;
1303
1304         uint32_t sum = 0;
1305
1306         for (t=ips; t != NULL; t=t->next) {
1307                 if (t->pnn != pnn) {
1308                         continue;
1309                 }
1310
1311                 /* Optimisation: We never calculate the distance
1312                  * between an address and itself.  This allows us to
1313                  * calculate the effect of removing an address from a
1314                  * node by simply calculating the distance between
1315                  * that address and all of the exitsing addresses.
1316                  * Moreover, we assume that we're only ever dealing
1317                  * with addresses from all_ips so we can identify an
1318                  * address via a pointer rather than doing a more
1319                  * expensive address comparison. */
1320                 if (&(t->addr) == ip) {
1321                         continue;
1322                 }
1323
1324                 d = ip_distance(ip, &(t->addr));
1325                 sum += d * d;  /* Cheaper than pulling in math.h :-) */
1326         }
1327
1328         return sum;
1329 }
1330
1331 /* Return the LCP2 imbalance metric for addresses currently assigned
1332    to the given node.
1333  * Not static, so we can easily link it into a unit test.
1334  */
1335 uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1336 {
1337         struct ctdb_public_ip_list *t;
1338
1339         uint32_t imbalance = 0;
1340
1341         for (t=all_ips; t!=NULL; t=t->next) {
1342                 if (t->pnn != pnn) {
1343                         continue;
1344                 }
1345                 /* Pass the rest of the IPs rather than the whole
1346                    all_ips input list.
1347                 */
1348                 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1349         }
1350
1351         return imbalance;
1352 }
1353
1354 /* Allocate any unassigned IPs just by looping through the IPs and
1355  * finding the best node for each.
1356  * Not static, so we can easily link it into a unit test.
1357  */
1358 void basic_allocate_unassigned(struct ctdb_context *ctdb,
1359                                struct ctdb_node_map *nodemap,
1360                                uint32_t mask,
1361                                struct ctdb_public_ip_list *all_ips)
1362 {
1363         struct ctdb_public_ip_list *tmp_ip;
1364
1365         /* loop over all ip's and find a physical node to cover for 
1366            each unassigned ip.
1367         */
1368         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1369                 if (tmp_ip->pnn == -1) {
1370                         if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1371                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1372                                         ctdb_addr_to_str(&tmp_ip->addr)));
1373                         }
1374                 }
1375         }
1376 }
1377
1378 /* Basic non-deterministic rebalancing algorithm.
1379  * Not static, so we can easily link it into a unit test.
1380  */
1381 bool basic_failback(struct ctdb_context *ctdb,
1382                     struct ctdb_node_map *nodemap,
1383                     uint32_t mask,
1384                     struct ctdb_public_ip_list *all_ips,
1385                     int num_ips,
1386                     int *retries)
1387 {
1388         int i;
1389         int maxnode, maxnum=0, minnode, minnum=0, num;
1390         struct ctdb_public_ip_list *tmp_ip;
1391
1392         /* for each ip address, loop over all nodes that can serve
1393            this ip and make sure that the difference between the node
1394            serving the most and the node serving the least ip's are
1395            not greater than 1.
1396         */
1397         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1398                 if (tmp_ip->pnn == -1) {
1399                         continue;
1400                 }
1401
1402                 /* Get the highest and lowest number of ips's served by any 
1403                    valid node which can serve this ip.
1404                 */
1405                 maxnode = -1;
1406                 minnode = -1;
1407                 for (i=0;i<nodemap->num;i++) {
1408                         if (nodemap->nodes[i].flags & mask) {
1409                                 continue;
1410                         }
1411
1412                         /* only check nodes that can actually serve this ip */
1413                         if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1414                                 /* no it couldnt   so skip to the next node */
1415                                 continue;
1416                         }
1417
1418                         num = node_ip_coverage(ctdb, i, all_ips);
1419                         if (maxnode == -1) {
1420                                 maxnode = i;
1421                                 maxnum  = num;
1422                         } else {
1423                                 if (num > maxnum) {
1424                                         maxnode = i;
1425                                         maxnum  = num;
1426                                 }
1427                         }
1428                         if (minnode == -1) {
1429                                 minnode = i;
1430                                 minnum  = num;
1431                         } else {
1432                                 if (num < minnum) {
1433                                         minnode = i;
1434                                         minnum  = num;
1435                                 }
1436                         }
1437                 }
1438                 if (maxnode == -1) {
1439                         DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1440                                 ctdb_addr_to_str(&tmp_ip->addr)));
1441
1442                         continue;
1443                 }
1444
1445                 /* If we want deterministic IPs then dont try to reallocate 
1446                    them to spread out the load.
1447                 */
1448                 if (1 == ctdb->tunable.deterministic_public_ips) {
1449                         continue;
1450                 }
1451
1452                 /* if the spread between the smallest and largest coverage by
1453                    a node is >=2 we steal one of the ips from the node with
1454                    most coverage to even things out a bit.
1455                    try to do this a limited number of times since we dont
1456                    want to spend too much time balancing the ip coverage.
1457                 */
1458                 if ( (maxnum > minnum+1)
1459                      && (*retries < (num_ips + 5)) ){
1460                         struct ctdb_public_ip_list *tmp;
1461
1462                         /* mark one of maxnode's vnn's as unassigned and try
1463                            again
1464                         */
1465                         for (tmp=all_ips;tmp;tmp=tmp->next) {
1466                                 if (tmp->pnn == maxnode) {
1467                                         tmp->pnn = -1;
1468                                         (*retries)++;
1469                                         return true;
1470                                 }
1471                         }
1472                 }
1473         }
1474
1475         return false;
1476 }
1477
1478 /* Do necessary LCP2 initialisation.  Bury it in a function here so
1479  * that we can unit test it.
1480  * Not static, so we can easily link it into a unit test.
1481  */
1482 void lcp2_init(struct ctdb_context * tmp_ctx,
1483                struct ctdb_node_map * nodemap,
1484                uint32_t mask,
1485                struct ctdb_public_ip_list *all_ips,
1486                uint32_t **lcp2_imbalances,
1487                bool **newly_healthy)
1488 {
1489         int i;
1490         struct ctdb_public_ip_list *tmp_ip;
1491
1492         *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
1493         CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
1494         *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1495         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1496
1497         for (i=0;i<nodemap->num;i++) {
1498                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1499                 /* First step: is the node "healthy"? */
1500                 (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
1501         }
1502
1503         /* 2nd step: if a ndoe has IPs assigned then it must have been
1504          * healthy before, so we remove it from consideration... */
1505         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1506                 if (tmp_ip->pnn != -1) {
1507                         (*newly_healthy)[tmp_ip->pnn] = false;
1508                 }
1509         }
1510 }
1511
1512 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1513  * the IP/node combination that will cost the least.
1514  * Not static, so we can easily link it into a unit test.
1515  */
1516 void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1517                               struct ctdb_node_map *nodemap,
1518                               uint32_t mask,
1519                               struct ctdb_public_ip_list *all_ips,
1520                               uint32_t *lcp2_imbalances)
1521 {
1522         struct ctdb_public_ip_list *tmp_ip;
1523         int dstnode;
1524
1525         int minnode;
1526         uint32_t mindsum, dstdsum, dstimbl, minimbl;
1527         struct ctdb_public_ip_list *minip;
1528
1529         bool should_loop = true;
1530         bool have_unassigned = true;
1531
1532         while (have_unassigned && should_loop) {
1533                 should_loop = false;
1534
1535                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1536                 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1537
1538                 minnode = -1;
1539                 mindsum = 0;
1540                 minip = NULL;
1541
1542                 /* loop over each unassigned ip. */
1543                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1544                         if (tmp_ip->pnn != -1) {
1545                                 continue;
1546                         }
1547
1548                         for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1549                                 /* only check nodes that can actually serve this ip */
1550                                 if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1551                                         /* no it couldnt   so skip to the next node */
1552                                         continue;
1553                                 }
1554                                 if (nodemap->nodes[dstnode].flags & mask) {
1555                                         continue;
1556                                 }
1557
1558                                 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1559                                 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1560                                 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1561                                                    ctdb_addr_to_str(&(tmp_ip->addr)),
1562                                                    dstnode,
1563                                                    dstimbl - lcp2_imbalances[dstnode]));
1564
1565
1566                                 if ((minnode == -1) || (dstdsum < mindsum)) {
1567                                         minnode = dstnode;
1568                                         minimbl = dstimbl;
1569                                         mindsum = dstdsum;
1570                                         minip = tmp_ip;
1571                                         should_loop = true;
1572                                 }
1573                         }
1574                 }
1575
1576                 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1577
1578                 /* If we found one then assign it to the given node. */
1579                 if (minnode != -1) {
1580                         minip->pnn = minnode;
1581                         lcp2_imbalances[minnode] = minimbl;
1582                         DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1583                                           ctdb_addr_to_str(&(minip->addr)),
1584                                           minnode,
1585                                           mindsum));
1586                 }
1587
1588                 /* There might be a better way but at least this is clear. */
1589                 have_unassigned = false;
1590                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1591                         if (tmp_ip->pnn == -1) {
1592                                 have_unassigned = true;
1593                         }
1594                 }
1595         }
1596
1597         /* We know if we have an unassigned addresses so we might as
1598          * well optimise.
1599          */
1600         if (have_unassigned) {
1601                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1602                         if (tmp_ip->pnn == -1) {
1603                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1604                                                      ctdb_addr_to_str(&tmp_ip->addr)));
1605                         }
1606                 }
1607         }
1608 }
1609
1610 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
1611  * node with the highest LCP2 imbalance, and then determines the best
1612  * IP/destination node combination to move from the source node.
1613  *
1614  * Not static, so we can easily link it into a unit test.
1615  */
1616 bool lcp2_failback(struct ctdb_context *ctdb,
1617                    struct ctdb_node_map *nodemap,
1618                    uint32_t mask,
1619                    struct ctdb_public_ip_list *all_ips,
1620                    uint32_t *lcp2_imbalances,
1621                    bool *newly_healthy)
1622 {
1623         int srcnode, dstnode, mindstnode, i, num_newly_healthy;
1624         uint32_t srcimbl, srcdsum, maximbl, dstimbl, dstdsum;
1625         uint32_t minsrcimbl, mindstimbl, b;
1626         struct ctdb_public_ip_list *minip;
1627         struct ctdb_public_ip_list *tmp_ip;
1628
1629         /* It is only worth continuing if we have suitable target
1630          * nodes to transfer IPs to.  This check is much cheaper than
1631          * continuing on...
1632          */
1633         num_newly_healthy = 0;
1634         for (i = 0; i < nodemap->num; i++) {
1635                 if (newly_healthy[i]) {
1636                         num_newly_healthy++;
1637                 }
1638         }
1639         if (num_newly_healthy == 0) {
1640                 return false;
1641         }
1642
1643         /* Get the node with the highest imbalance metric. */
1644         srcnode = -1;
1645         maximbl = 0;
1646         for (i=0; i < nodemap->num; i++) {
1647                 b = lcp2_imbalances[i];
1648                 if ((srcnode == -1) || (b > maximbl)) {
1649                         srcnode = i;
1650                         maximbl = b;
1651                 }
1652         }
1653
1654         /* This means that all nodes had 0 or 1 addresses, so can't be
1655          * imbalanced.
1656          */
1657         if (maximbl == 0) {
1658                 return false;
1659         }
1660
1661         /* Find an IP and destination node that best reduces imbalance. */
1662         minip = NULL;
1663         minsrcimbl = 0;
1664         mindstnode = -1;
1665         mindstimbl = 0;
1666
1667         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1668         DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, maximbl));
1669
1670         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1671                 /* Only consider addresses on srcnode. */
1672                 if (tmp_ip->pnn != srcnode) {
1673                         continue;
1674                 }
1675
1676                 /* What is this IP address costing the source node? */
1677                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1678                 srcimbl = maximbl - srcdsum;
1679
1680                 /* Consider this IP address would cost each potential
1681                  * destination node.  Destination nodes are limited to
1682                  * those that are newly healthy, since we don't want
1683                  * to do gratuitous failover of IPs just to make minor
1684                  * balance improvements.
1685                  */
1686                 for (dstnode=0; dstnode < nodemap->num; dstnode++) {
1687                         if (! newly_healthy[dstnode]) {
1688                                 continue;
1689                         }
1690                         /* only check nodes that can actually serve this ip */
1691                         if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
1692                                 /* no it couldnt   so skip to the next node */
1693                                 continue;
1694                         }
1695
1696                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1697                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1698                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1699                                            srcnode, srcimbl - lcp2_imbalances[srcnode],
1700                                            ctdb_addr_to_str(&(tmp_ip->addr)),
1701                                            dstnode, dstimbl - lcp2_imbalances[dstnode]));
1702
1703                         if ((dstimbl < maximbl) && (dstdsum < srcdsum) && \
1704                             ((mindstnode == -1) ||                              \
1705                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1706
1707                                 minip = tmp_ip;
1708                                 minsrcimbl = srcimbl;
1709                                 mindstnode = dstnode;
1710                                 mindstimbl = dstimbl;
1711                         }
1712                 }
1713         }
1714         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1715
1716         if (mindstnode != -1) {
1717                 /* We found a move that makes things better... */
1718                 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1719                                   srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1720                                   ctdb_addr_to_str(&(minip->addr)),
1721                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1722
1723
1724                 lcp2_imbalances[srcnode] = srcimbl;
1725                 lcp2_imbalances[mindstnode] = mindstimbl;
1726                 minip->pnn = mindstnode;
1727
1728                 return true;
1729         }
1730
1731         return false;
1732         
1733 }
1734
1735 /* The calculation part of the IP allocation algorithm.
1736  * Not static, so we can easily link it into a unit test.
1737  */
1738 void ctdb_takeover_run_core(struct ctdb_context *ctdb,
1739                             struct ctdb_node_map *nodemap,
1740                             struct ctdb_public_ip_list **all_ips_p)
1741 {
1742         int i, num_healthy, retries, num_ips;
1743         uint32_t mask;
1744         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1745         uint32_t *lcp2_imbalances;
1746         bool *newly_healthy;
1747
1748         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1749
1750         /* Count how many completely healthy nodes we have */
1751         num_healthy = 0;
1752         for (i=0;i<nodemap->num;i++) {
1753                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1754                         num_healthy++;
1755                 }
1756         }
1757
1758         if (num_healthy > 0) {
1759                 /* We have healthy nodes, so only consider them for 
1760                    serving public addresses
1761                 */
1762                 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1763         } else {
1764                 /* We didnt have any completely healthy nodes so
1765                    use "disabled" nodes as a fallback
1766                 */
1767                 mask = NODE_FLAGS_INACTIVE;
1768         }
1769
1770         /* since nodes only know about those public addresses that
1771            can be served by that particular node, no single node has
1772            a full list of all public addresses that exist in the cluster.
1773            Walk over all node structures and create a merged list of
1774            all public addresses that exist in the cluster.
1775
1776            keep the tree of ips around as ctdb->ip_tree
1777         */
1778         all_ips = create_merged_ip_list(ctdb);
1779         *all_ips_p = all_ips; /* minimal code changes */
1780
1781         /* Count how many ips we have */
1782         num_ips = 0;
1783         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1784                 num_ips++;
1785         }
1786
1787         /* If we want deterministic ip allocations, i.e. that the ip addresses
1788            will always be allocated the same way for a specific set of
1789            available/unavailable nodes.
1790         */
1791         if (1 == ctdb->tunable.deterministic_public_ips) {              
1792                 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1793                 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1794                         tmp_ip->pnn = i%nodemap->num;
1795                 }
1796         }
1797
1798
1799         /* mark all public addresses with a masked node as being served by
1800            node -1
1801         */
1802         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1803                 if (tmp_ip->pnn == -1) {
1804                         continue;
1805                 }
1806                 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1807                         tmp_ip->pnn = -1;
1808                 }
1809         }
1810
1811         /* verify that the assigned nodes can serve that public ip
1812            and set it to -1 if not
1813         */
1814         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1815                 if (tmp_ip->pnn == -1) {
1816                         continue;
1817                 }
1818                 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1819                         /* this node can not serve this ip. */
1820                         tmp_ip->pnn = -1;
1821                 }
1822         }
1823
1824         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1825                 lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
1826         }
1827
1828         /* now we must redistribute all public addresses with takeover node
1829            -1 among the nodes available
1830         */
1831         retries = 0;
1832 try_again:
1833         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1834                 lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
1835         } else {
1836                 basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
1837         }
1838
1839         /* If we dont want ips to fail back after a node becomes healthy
1840            again, we wont even try to reallocat the ip addresses so that
1841            they are evenly spread out.
1842            This can NOT be used at the same time as DeterministicIPs !
1843         */
1844         if (1 == ctdb->tunable.no_ip_failback) {
1845                 if (1 == ctdb->tunable.deterministic_public_ips) {
1846                         DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1847                 }
1848                 goto finished;
1849         }
1850
1851
1852         /* now, try to make sure the ip adresses are evenly distributed
1853            across the node.
1854         */
1855         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1856                 if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
1857                         goto try_again;
1858                 }
1859         } else {
1860                 if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
1861                         goto try_again;
1862                 }
1863         }
1864
1865         /* finished distributing the public addresses, now just send the 
1866            info out to the nodes
1867         */
1868 finished:
1869
1870         /* at this point ->pnn is the node which will own each IP
1871            or -1 if there is no node that can cover this ip
1872         */
1873
1874         return;
1875 }
1876
1877 /*
1878   make any IP alias changes for public addresses that are necessary 
1879  */
1880 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1881 {
1882         int i;
1883         struct ctdb_public_ip ip;
1884         struct ctdb_public_ipv4 ipv4;
1885         uint32_t *nodes;
1886         struct ctdb_public_ip_list *all_ips, *tmp_ip;
1887         TDB_DATA data;
1888         struct timeval timeout;
1889         struct client_async_data *async_data;
1890         struct ctdb_client_control_state *state;
1891         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1892
1893         /*
1894          * ip failover is completely disabled, just send out the 
1895          * ipreallocated event.
1896          */
1897         if (ctdb->tunable.disable_ip_failover != 0) {
1898                 goto ipreallocated;
1899         }
1900
1901         ZERO_STRUCT(ip);
1902
1903         /* Do the IP reassignment calculations */
1904         ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
1905
1906         /* now tell all nodes to delete any alias that they should not
1907            have.  This will be a NOOP on nodes that don't currently
1908            hold the given alias */
1909         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1910         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1911
1912         for (i=0;i<nodemap->num;i++) {
1913                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1914                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1915                         continue;
1916                 }
1917
1918                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1919                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1920                                 /* This node should be serving this
1921                                    vnn so dont tell it to release the ip
1922                                 */
1923                                 continue;
1924                         }
1925                         if (tmp_ip->addr.sa.sa_family == AF_INET) {
1926                                 ipv4.pnn = tmp_ip->pnn;
1927                                 ipv4.sin = tmp_ip->addr.ip;
1928
1929                                 timeout = TAKEOVER_TIMEOUT();
1930                                 data.dsize = sizeof(ipv4);
1931                                 data.dptr  = (uint8_t *)&ipv4;
1932                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1933                                                 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1934                                                 data, async_data,
1935                                                 &timeout, NULL);
1936                         } else {
1937                                 ip.pnn  = tmp_ip->pnn;
1938                                 ip.addr = tmp_ip->addr;
1939
1940                                 timeout = TAKEOVER_TIMEOUT();
1941                                 data.dsize = sizeof(ip);
1942                                 data.dptr  = (uint8_t *)&ip;
1943                                 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1944                                                 0, CTDB_CONTROL_RELEASE_IP, 0,
1945                                                 data, async_data,
1946                                                 &timeout, NULL);
1947                         }
1948
1949                         if (state == NULL) {
1950                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1951                                 talloc_free(tmp_ctx);
1952                                 return -1;
1953                         }
1954                 
1955                         ctdb_client_async_add(async_data, state);
1956                 }
1957         }
1958         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1959                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1960                 talloc_free(tmp_ctx);
1961                 return -1;
1962         }
1963         talloc_free(async_data);
1964
1965
1966         /* tell all nodes to get their own IPs */
1967         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1968         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1969         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1970                 if (tmp_ip->pnn == -1) {
1971                         /* this IP won't be taken over */
1972                         continue;
1973                 }
1974
1975                 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1976                         ipv4.pnn = tmp_ip->pnn;
1977                         ipv4.sin = tmp_ip->addr.ip;
1978
1979                         timeout = TAKEOVER_TIMEOUT();
1980                         data.dsize = sizeof(ipv4);
1981                         data.dptr  = (uint8_t *)&ipv4;
1982                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1983                                         0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1984                                         data, async_data,
1985                                         &timeout, NULL);
1986                 } else {
1987                         ip.pnn  = tmp_ip->pnn;
1988                         ip.addr = tmp_ip->addr;
1989
1990                         timeout = TAKEOVER_TIMEOUT();
1991                         data.dsize = sizeof(ip);
1992                         data.dptr  = (uint8_t *)&ip;
1993                         state = ctdb_control_send(ctdb, tmp_ip->pnn,
1994                                         0, CTDB_CONTROL_TAKEOVER_IP, 0,
1995                                         data, async_data,
1996                                         &timeout, NULL);
1997                 }
1998                 if (state == NULL) {
1999                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2000                         talloc_free(tmp_ctx);
2001                         return -1;
2002                 }
2003                 
2004                 ctdb_client_async_add(async_data, state);
2005         }
2006         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2007                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2008                 talloc_free(tmp_ctx);
2009                 return -1;
2010         }
2011
2012 ipreallocated:
2013         /* tell all nodes to update natwg */
2014         /* send the flags update natgw on all connected nodes */
2015         data.dptr  = discard_const("ipreallocated");
2016         data.dsize = strlen((char *)data.dptr) + 1; 
2017         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2018         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
2019                                       nodes, 0, TAKEOVER_TIMEOUT(),
2020                                       false, data,
2021                                       NULL, NULL,
2022                                       NULL) != 0) {
2023                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
2024         }
2025
2026         talloc_free(tmp_ctx);
2027         return 0;
2028 }
2029
2030
2031 /*
2032   destroy a ctdb_client_ip structure
2033  */
2034 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2035 {
2036         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2037                 ctdb_addr_to_str(&ip->addr),
2038                 ntohs(ip->addr.ip.sin_port),
2039                 ip->client_id));
2040
2041         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2042         return 0;
2043 }
2044
2045 /*
2046   called by a client to inform us of a TCP connection that it is managing
2047   that should tickled with an ACK when IP takeover is done
2048   we handle both the old ipv4 style of packets as well as the new ipv4/6
2049   pdus.
2050  */
2051 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2052                                 TDB_DATA indata)
2053 {
2054         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2055         struct ctdb_control_tcp *old_addr = NULL;
2056         struct ctdb_control_tcp_addr new_addr;
2057         struct ctdb_control_tcp_addr *tcp_sock = NULL;
2058         struct ctdb_tcp_list *tcp;
2059         struct ctdb_tcp_connection t;
2060         int ret;
2061         TDB_DATA data;
2062         struct ctdb_client_ip *ip;
2063         struct ctdb_vnn *vnn;
2064         ctdb_sock_addr addr;
2065
2066         switch (indata.dsize) {
2067         case sizeof(struct ctdb_control_tcp):
2068                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2069                 ZERO_STRUCT(new_addr);
2070                 tcp_sock = &new_addr;
2071                 tcp_sock->src.ip  = old_addr->src;
2072                 tcp_sock->dest.ip = old_addr->dest;
2073                 break;
2074         case sizeof(struct ctdb_control_tcp_addr):
2075                 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2076                 break;
2077         default:
2078                 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2079                                  "to ctdb_control_tcp_client. size was %d but "
2080                                  "only allowed sizes are %lu and %lu\n",
2081                                  (int)indata.dsize,
2082                                  (long unsigned)sizeof(struct ctdb_control_tcp),
2083                                  (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2084                 return -1;
2085         }
2086
2087         addr = tcp_sock->src;
2088         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2089         addr = tcp_sock->dest;
2090         ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2091
2092         ZERO_STRUCT(addr);
2093         memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2094         vnn = find_public_ip_vnn(ctdb, &addr);
2095         if (vnn == NULL) {
2096                 switch (addr.sa.sa_family) {
2097                 case AF_INET:
2098                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2099                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2100                                         ctdb_addr_to_str(&addr)));
2101                         }
2102                         break;
2103                 case AF_INET6:
2104                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2105                                 ctdb_addr_to_str(&addr)));
2106                         break;
2107                 default:
2108                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2109                 }
2110
2111                 return 0;
2112         }
2113
2114         if (vnn->pnn != ctdb->pnn) {
2115                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2116                         ctdb_addr_to_str(&addr),
2117                         client_id, client->pid));
2118                 /* failing this call will tell smbd to die */
2119                 return -1;
2120         }
2121
2122         ip = talloc(client, struct ctdb_client_ip);
2123         CTDB_NO_MEMORY(ctdb, ip);
2124
2125         ip->ctdb      = ctdb;
2126         ip->addr      = addr;
2127         ip->client_id = client_id;
2128         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2129         DLIST_ADD(ctdb->client_ip_list, ip);
2130
2131         tcp = talloc(client, struct ctdb_tcp_list);
2132         CTDB_NO_MEMORY(ctdb, tcp);
2133
2134         tcp->connection.src_addr = tcp_sock->src;
2135         tcp->connection.dst_addr = tcp_sock->dest;
2136
2137         DLIST_ADD(client->tcp_list, tcp);
2138
2139         t.src_addr = tcp_sock->src;
2140         t.dst_addr = tcp_sock->dest;
2141
2142         data.dptr = (uint8_t *)&t;
2143         data.dsize = sizeof(t);
2144
2145         switch (addr.sa.sa_family) {
2146         case AF_INET:
2147                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2148                         (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
2149                         ctdb_addr_to_str(&tcp_sock->src),
2150                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2151                 break;
2152         case AF_INET6:
2153                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2154                         (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
2155                         ctdb_addr_to_str(&tcp_sock->src),
2156                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2157                 break;
2158         default:
2159                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2160         }
2161
2162
2163         /* tell all nodes about this tcp connection */
2164         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2165                                        CTDB_CONTROL_TCP_ADD,
2166                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2167         if (ret != 0) {
2168                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2169                 return -1;
2170         }
2171
2172         return 0;
2173 }
2174
2175 /*
2176   find a tcp address on a list
2177  */
2178 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
2179                                            struct ctdb_tcp_connection *tcp)
2180 {
2181         int i;
2182
2183         if (array == NULL) {
2184                 return NULL;
2185         }
2186
2187         for (i=0;i<array->num;i++) {
2188                 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2189                     ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2190                         return &array->connections[i];
2191                 }
2192         }
2193         return NULL;
2194 }
2195
2196
2197
2198 /*
2199   called by a daemon to inform us of a TCP connection that one of its
2200   clients managing that should tickled with an ACK when IP takeover is
2201   done
2202  */
2203 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2204 {
2205         struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2206         struct ctdb_tcp_array *tcparray;
2207         struct ctdb_tcp_connection tcp;
2208         struct ctdb_vnn *vnn;
2209
2210         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2211         if (vnn == NULL) {
2212                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2213                         ctdb_addr_to_str(&p->dst_addr)));
2214
2215                 return -1;
2216         }
2217
2218
2219         tcparray = vnn->tcp_array;
2220
2221         /* If this is the first tickle */
2222         if (tcparray == NULL) {
2223                 tcparray = talloc_size(ctdb->nodes, 
2224                         offsetof(struct ctdb_tcp_array, connections) +
2225                         sizeof(struct ctdb_tcp_connection) * 1);
2226                 CTDB_NO_MEMORY(ctdb, tcparray);
2227                 vnn->tcp_array = tcparray;
2228
2229                 tcparray->num = 0;
2230                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2231                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2232
2233                 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2234                 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2235                 tcparray->num++;
2236
2237                 if (tcp_update_needed) {
2238                         vnn->tcp_update_needed = true;
2239                 }
2240                 return 0;
2241         }
2242
2243
2244         /* Do we already have this tickle ?*/
2245         tcp.src_addr = p->src_addr;
2246         tcp.dst_addr = p->dst_addr;
2247         if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2248                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2249                         ctdb_addr_to_str(&tcp.dst_addr),
2250                         ntohs(tcp.dst_addr.ip.sin_port),
2251                         vnn->pnn));
2252                 return 0;
2253         }
2254
2255         /* A new tickle, we must add it to the array */
2256         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2257                                         struct ctdb_tcp_connection,
2258                                         tcparray->num+1);
2259         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2260
2261         vnn->tcp_array = tcparray;
2262         tcparray->connections[tcparray->num].src_addr = p->src_addr;
2263         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2264         tcparray->num++;
2265                                 
2266         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2267                 ctdb_addr_to_str(&tcp.dst_addr),
2268                 ntohs(tcp.dst_addr.ip.sin_port),
2269                 vnn->pnn));
2270
2271         if (tcp_update_needed) {
2272                 vnn->tcp_update_needed = true;
2273         }
2274
2275         return 0;
2276 }
2277
2278
2279 /*
2280   called by a daemon to inform us of a TCP connection that one of its
2281   clients managing that should tickled with an ACK when IP takeover is
2282   done
2283  */
2284 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2285 {
2286         struct ctdb_tcp_connection *tcpp;
2287         struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2288
2289         if (vnn == NULL) {
2290                 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2291                         ctdb_addr_to_str(&conn->dst_addr)));
2292                 return;
2293         }
2294
2295         /* if the array is empty we cant remove it
2296            and we dont need to do anything
2297          */
2298         if (vnn->tcp_array == NULL) {
2299                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2300                         ctdb_addr_to_str(&conn->dst_addr),
2301                         ntohs(conn->dst_addr.ip.sin_port)));
2302                 return;
2303         }
2304
2305
2306         /* See if we know this connection
2307            if we dont know this connection  then we dont need to do anything
2308          */
2309         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2310         if (tcpp == NULL) {
2311                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2312                         ctdb_addr_to_str(&conn->dst_addr),
2313                         ntohs(conn->dst_addr.ip.sin_port)));
2314                 return;
2315         }
2316
2317
2318         /* We need to remove this entry from the array.
2319            Instead of allocating a new array and copying data to it
2320            we cheat and just copy the last entry in the existing array
2321            to the entry that is to be removed and just shring the 
2322            ->num field
2323          */
2324         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2325         vnn->tcp_array->num--;
2326
2327         /* If we deleted the last entry we also need to remove the entire array
2328          */
2329         if (vnn->tcp_array->num == 0) {
2330                 talloc_free(vnn->tcp_array);
2331                 vnn->tcp_array = NULL;
2332         }               
2333
2334         vnn->tcp_update_needed = true;
2335
2336         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2337                 ctdb_addr_to_str(&conn->src_addr),
2338                 ntohs(conn->src_addr.ip.sin_port)));
2339 }
2340
2341
2342 /*
2343   called by a daemon to inform us of a TCP connection that one of its
2344   clients used are no longer needed in the tickle database
2345  */
2346 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2347 {
2348         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
2349
2350         ctdb_remove_tcp_connection(ctdb, conn);
2351
2352         return 0;
2353 }
2354
2355
/*
  called when a daemon restarts.  The intent is to send all tickles for
  all public addresses we are serving to the new node immediately; that
  is not implemented yet, so this is a no-op that returns 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
        /* XXX here we should send all tickles we are serving to the new node */
        (void)ctdb;
        (void)vnn;

        return 0;
}
2365
2366
2367 /*
2368   called when a client structure goes away - hook to remove
2369   elements from the tcp_list in all daemons
2370  */
2371 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2372 {
2373         while (client->tcp_list) {
2374                 struct ctdb_tcp_list *tcp = client->tcp_list;
2375                 DLIST_REMOVE(client->tcp_list, tcp);
2376                 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
2377         }
2378 }
2379
2380
2381 /*
2382   release all IPs on shutdown
2383  */
2384 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2385 {
2386         struct ctdb_vnn *vnn;
2387
2388         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2389                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2390                         ctdb_vnn_unassign_iface(ctdb, vnn);
2391                         continue;
2392                 }
2393                 if (!vnn->iface) {
2394                         continue;
2395                 }
2396                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2397                                   ctdb_vnn_iface_string(vnn),
2398                                   ctdb_addr_to_str(&vnn->public_address),
2399                                   vnn->public_netmask_bits);
2400                 release_kill_clients(ctdb, &vnn->public_address);
2401                 ctdb_vnn_unassign_iface(ctdb, vnn);
2402         }
2403 }
2404
2405
2406 /*
2407   get list of public IPs
2408  */
2409 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2410                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2411 {
2412         int i, num, len;
2413         struct ctdb_all_public_ips *ips;
2414         struct ctdb_vnn *vnn;
2415         bool only_available = false;
2416
2417         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2418                 only_available = true;
2419         }
2420
2421         /* count how many public ip structures we have */
2422         num = 0;
2423         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2424                 num++;
2425         }
2426
2427         len = offsetof(struct ctdb_all_public_ips, ips) + 
2428                 num*sizeof(struct ctdb_public_ip);
2429         ips = talloc_zero_size(outdata, len);
2430         CTDB_NO_MEMORY(ctdb, ips);
2431
2432         i = 0;
2433         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2434                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2435                         continue;
2436                 }
2437                 ips->ips[i].pnn  = vnn->pnn;
2438                 ips->ips[i].addr = vnn->public_address;
2439                 i++;
2440         }
2441         ips->num = i;
2442         len = offsetof(struct ctdb_all_public_ips, ips) +
2443                 i*sizeof(struct ctdb_public_ip);
2444
2445         outdata->dsize = len;
2446         outdata->dptr  = (uint8_t *)ips;
2447
2448         return 0;
2449 }
2450
2451
2452 /*
2453   get list of public IPs, old ipv4 style.  only returns ipv4 addresses
2454  */
2455 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
2456                                     struct ctdb_req_control *c, TDB_DATA *outdata)
2457 {
2458         int i, num, len;
2459         struct ctdb_all_public_ipsv4 *ips;
2460         struct ctdb_vnn *vnn;
2461
2462         /* count how many public ip structures we have */
2463         num = 0;
2464         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2465                 if (vnn->public_address.sa.sa_family != AF_INET) {
2466                         continue;
2467                 }
2468                 num++;
2469         }
2470
2471         len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
2472                 num*sizeof(struct ctdb_public_ipv4);
2473         ips = talloc_zero_size(outdata, len);
2474         CTDB_NO_MEMORY(ctdb, ips);
2475
2476         outdata->dsize = len;
2477         outdata->dptr  = (uint8_t *)ips;
2478
2479         ips->num = num;
2480         i = 0;
2481         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2482                 if (vnn->public_address.sa.sa_family != AF_INET) {
2483                         continue;
2484                 }
2485                 ips->ips[i].pnn = vnn->pnn;
2486                 ips->ips[i].sin = vnn->public_address.ip;
2487                 i++;
2488         }
2489
2490         return 0;
2491 }
2492
2493 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2494                                         struct ctdb_req_control *c,
2495                                         TDB_DATA indata,
2496                                         TDB_DATA *outdata)
2497 {
2498         int i, num, len;
2499         ctdb_sock_addr *addr;
2500         struct ctdb_control_public_ip_info *info;
2501         struct ctdb_vnn *vnn;
2502
2503         addr = (ctdb_sock_addr *)indata.dptr;
2504
2505         vnn = find_public_ip_vnn(ctdb, addr);
2506         if (vnn == NULL) {
2507                 /* if it is not a public ip   it could be our 'single ip' */
2508                 if (ctdb->single_ip_vnn) {
2509                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2510                                 vnn = ctdb->single_ip_vnn;
2511                         }
2512                 }
2513         }
2514         if (vnn == NULL) {
2515                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2516                                  "'%s'not a public address\n",
2517                                  ctdb_addr_to_str(addr)));
2518                 return -1;
2519         }
2520
2521         /* count how many public ip structures we have */
2522         num = 0;
2523         for (;vnn->ifaces[num];) {
2524                 num++;
2525         }
2526
2527         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2528                 num*sizeof(struct ctdb_control_iface_info);
2529         info = talloc_zero_size(outdata, len);
2530         CTDB_NO_MEMORY(ctdb, info);
2531
2532         info->ip.addr = vnn->public_address;
2533         info->ip.pnn = vnn->pnn;
2534         info->active_idx = 0xFFFFFFFF;
2535
2536         for (i=0; vnn->ifaces[i]; i++) {
2537                 struct ctdb_iface *cur;
2538
2539                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2540                 if (cur == NULL) {
2541                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2542                                            vnn->ifaces[i]));
2543                         return -1;
2544                 }
2545                 if (vnn->iface == cur) {
2546                         info->active_idx = i;
2547                 }
2548                 strcpy(info->ifaces[i].name, cur->name);
2549                 info->ifaces[i].link_state = cur->link_up;
2550                 info->ifaces[i].references = cur->references;
2551         }
2552         info->num = i;
2553         len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
2554                 i*sizeof(struct ctdb_control_iface_info);
2555
2556         outdata->dsize = len;
2557         outdata->dptr  = (uint8_t *)info;
2558
2559         return 0;
2560 }
2561
2562 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2563                                 struct ctdb_req_control *c,
2564                                 TDB_DATA *outdata)
2565 {
2566         int i, num, len;
2567         struct ctdb_control_get_ifaces *ifaces;
2568         struct ctdb_iface *cur;
2569
2570         /* count how many public ip structures we have */
2571         num = 0;
2572         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2573                 num++;
2574         }
2575
2576         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2577                 num*sizeof(struct ctdb_control_iface_info);
2578         ifaces = talloc_zero_size(outdata, len);
2579         CTDB_NO_MEMORY(ctdb, ifaces);
2580
2581         i = 0;
2582         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2583                 strcpy(ifaces->ifaces[i].name, cur->name);
2584                 ifaces->ifaces[i].link_state = cur->link_up;
2585                 ifaces->ifaces[i].references = cur->references;
2586                 i++;
2587         }
2588         ifaces->num = i;
2589         len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
2590                 i*sizeof(struct ctdb_control_iface_info);
2591
2592         outdata->dsize = len;
2593         outdata->dptr  = (uint8_t *)ifaces;
2594
2595         return 0;
2596 }
2597
2598 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2599                                     struct ctdb_req_control *c,
2600                                     TDB_DATA indata)
2601 {
2602         struct ctdb_control_iface_info *info;
2603         struct ctdb_iface *iface;
2604         bool link_up = false;
2605
2606         info = (struct ctdb_control_iface_info *)indata.dptr;
2607
2608         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2609                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2610                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2611                                   len, len, info->name));
2612                 return -1;
2613         }
2614
2615         switch (info->link_state) {
2616         case 0:
2617                 link_up = false;
2618                 break;
2619         case 1:
2620                 link_up = true;
2621                 break;
2622         default:
2623                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2624                                   (unsigned int)info->link_state));
2625                 return -1;
2626         }
2627
2628         if (info->references != 0) {
2629                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2630                                   (unsigned int)info->references));
2631                 return -1;
2632         }
2633
2634         iface = ctdb_find_iface(ctdb, info->name);
2635         if (iface == NULL) {
2636                 return -1;
2637         }
2638
2639         if (link_up == iface->link_up) {
2640                 return 0;
2641         }
2642
2643         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2644               ("iface[%s] has changed it's link status %s => %s\n",
2645                iface->name,
2646                iface->link_up?"up":"down",
2647                link_up?"up":"down"));
2648
2649         iface->link_up = link_up;
2650         return 0;
2651 }
2652
2653
2654 /* 
2655    structure containing the listening socket and the list of tcp connections
2656    that the ctdb daemon is to kill
2657 */
2658 struct ctdb_kill_tcp {
2659         struct ctdb_vnn *vnn;
2660         struct ctdb_context *ctdb;
2661         int capture_fd;
2662         struct fd_event *fde;
2663         trbt_tree_t *connections;
2664         void *private_data;
2665 };
2666
2667 /*
2668   a tcp connection that is to be killed
2669  */
2670 struct ctdb_killtcp_con {
2671         ctdb_sock_addr src_addr;
2672         ctdb_sock_addr dst_addr;
2673         int count;
2674         struct ctdb_kill_tcp *killtcp;
2675 };
2676
2677 /* this function is used to create a key to represent this socketpair
2678    in the killtcp tree.
2679    this key is used to insert and lookup matching socketpairs that are
2680    to be tickled and RST
2681 */
2682 #define KILLTCP_KEYLEN  10
2683 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2684 {
2685         static uint32_t key[KILLTCP_KEYLEN];
2686
2687         bzero(key, sizeof(key));
2688
2689         if (src->sa.sa_family != dst->sa.sa_family) {
2690                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2691                 return key;
2692         }
2693         
2694         switch (src->sa.sa_family) {
2695         case AF_INET:
2696                 key[0]  = dst->ip.sin_addr.s_addr;
2697                 key[1]  = src->ip.sin_addr.s_addr;
2698                 key[2]  = dst->ip.sin_port;
2699                 key[3]  = src->ip.sin_port;
2700                 break;
2701         case AF_INET6:
2702                 key[0]  = dst->ip6.sin6_addr.s6_addr32[3];
2703                 key[1]  = src->ip6.sin6_addr.s6_addr32[3];
2704                 key[2]  = dst->ip6.sin6_addr.s6_addr32[2];
2705                 key[3]  = src->ip6.sin6_addr.s6_addr32[2];
2706                 key[4]  = dst->ip6.sin6_addr.s6_addr32[1];
2707                 key[5]  = src->ip6.sin6_addr.s6_addr32[1];
2708                 key[6]  = dst->ip6.sin6_addr.s6_addr32[0];
2709                 key[7]  = src->ip6.sin6_addr.s6_addr32[0];
2710                 key[8]  = dst->ip6.sin6_port;
2711                 key[9]  = src->ip6.sin6_port;
2712                 break;
2713         default:
2714                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2715                 return key;
2716         }
2717
2718         return key;
2719 }
2720
2721 /*
2722   called when we get a read event on the raw socket
2723  */
2724 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
2725                                 uint16_t flags, void *private_data)
2726 {
2727         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2728         struct ctdb_killtcp_con *con;
2729         ctdb_sock_addr src, dst;
2730         uint32_t ack_seq, seq;
2731
2732         if (!(flags & EVENT_FD_READ)) {
2733                 return;
2734         }
2735
2736         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2737                                 killtcp->private_data,
2738                                 &src, &dst,
2739                                 &ack_seq, &seq) != 0) {
2740                 /* probably a non-tcp ACK packet */
2741                 return;
2742         }
2743
2744         /* check if we have this guy in our list of connections
2745            to kill
2746         */
2747         con = trbt_lookuparray32(killtcp->connections, 
2748                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2749         if (con == NULL) {
2750                 /* no this was some other packet we can just ignore */
2751                 return;
2752         }
2753
2754         /* This one has been tickled !
2755            now reset him and remove him from the list.
2756          */
2757         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2758                 ntohs(con->dst_addr.ip.sin_port),
2759                 ctdb_addr_to_str(&con->src_addr),
2760                 ntohs(con->src_addr.ip.sin_port)));
2761
2762         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2763         talloc_free(con);
2764 }
2765
2766
2767 /* when traversing the list of all tcp connections to send tickle acks to
2768    (so that we can capture the ack coming back and kill the connection
2769     by a RST)
2770    this callback is called for each connection we are currently trying to kill
2771 */
2772 static void tickle_connection_traverse(void *param, void *data)
2773 {
2774         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2775
2776         /* have tried too many times, just give up */
2777         if (con->count >= 5) {
2778                 /* can't delete in traverse: reparent to delete_cons */
2779                 talloc_steal(param, con);
2780                 return;
2781         }
2782
2783         /* othervise, try tickling it again */
2784         con->count++;
2785         ctdb_sys_send_tcp(
2786                 (ctdb_sock_addr *)&con->dst_addr,
2787                 (ctdb_sock_addr *)&con->src_addr,
2788                 0, 0, 0);
2789 }
2790
2791
2792 /* 
2793    called every second until all sentenced connections have been reset
2794  */
2795 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
2796                                               struct timeval t, void *private_data)
2797 {
2798         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2799         void *delete_cons = talloc_new(NULL);
2800
2801         /* loop over all connections sending tickle ACKs */
2802         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2803
2804         /* now we've finished traverse, it's safe to do deletion. */
2805         talloc_free(delete_cons);
2806
2807         /* If there are no more connections to kill we can remove the
2808            entire killtcp structure
2809          */
2810         if ( (killtcp->connections == NULL) || 
2811              (killtcp->connections->root == NULL) ) {
2812                 talloc_free(killtcp);
2813                 return;
2814         }
2815
2816         /* try tickling them again in a seconds time
2817          */
2818         event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2819                         ctdb_tickle_sentenced_connections, killtcp);
2820 }
2821
2822 /*
2823   destroy the killtcp structure
2824  */
2825 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2826 {
2827         if (killtcp->vnn) {
2828                 killtcp->vnn->killtcp = NULL;
2829         }
2830         return 0;
2831 }
2832
2833
/* insert callback for the killtcp tree: the new connection structure
   (parm) unconditionally replaces whatever was stored before (data).

   the old structure is deliberately not freed here; according to the
   original author it is owned by the same tree node and goes away
   together with the new data.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        (void)data;

        return parm;
}
2845
2846 /*
2847   add a tcp socket to the list of connections we want to RST
2848  */
2849 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2850                                        ctdb_sock_addr *s,
2851                                        ctdb_sock_addr *d)
2852 {
2853         ctdb_sock_addr src, dst;
2854         struct ctdb_kill_tcp *killtcp;
2855         struct ctdb_killtcp_con *con;
2856         struct ctdb_vnn *vnn;
2857
2858         ctdb_canonicalize_ip(s, &src);
2859         ctdb_canonicalize_ip(d, &dst);
2860
2861         vnn = find_public_ip_vnn(ctdb, &dst);
2862         if (vnn == NULL) {
2863                 vnn = find_public_ip_vnn(ctdb, &src);
2864         }
2865         if (vnn == NULL) {
2866                 /* if it is not a public ip   it could be our 'single ip' */
2867                 if (ctdb->single_ip_vnn) {
2868                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2869                                 vnn = ctdb->single_ip_vnn;
2870                         }
2871                 }
2872         }
2873         if (vnn == NULL) {
2874                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2875                 return -1;
2876         }
2877
2878         killtcp = vnn->killtcp;
2879         
2880         /* If this is the first connection to kill we must allocate
2881            a new structure
2882          */
2883         if (killtcp == NULL) {
2884                 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2885                 CTDB_NO_MEMORY(ctdb, killtcp);
2886
2887                 killtcp->vnn         = vnn;
2888                 killtcp->ctdb        = ctdb;
2889                 killtcp->capture_fd  = -1;
2890                 killtcp->connections = trbt_create(killtcp, 0);
2891
2892                 vnn->killtcp         = killtcp;
2893                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2894         }
2895
2896
2897
2898         /* create a structure that describes this connection we want to
2899            RST and store it in killtcp->connections
2900         */
2901         con = talloc(killtcp, struct ctdb_killtcp_con);
2902         CTDB_NO_MEMORY(ctdb, con);
2903         con->src_addr = src;
2904         con->dst_addr = dst;
2905         con->count    = 0;
2906         con->killtcp  = killtcp;
2907
2908
2909         trbt_insertarray32_callback(killtcp->connections,
2910                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2911                         add_killtcp_callback, con);
2912
2913         /* 
2914            If we dont have a socket to listen on yet we must create it
2915          */
2916         if (killtcp->capture_fd == -1) {
2917                 const char *iface = ctdb_vnn_iface_string(vnn);
2918                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2919                 if (killtcp->capture_fd == -1) {
2920                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2921                                           "socket on iface '%s' for killtcp (%s)\n",
2922                                           iface, strerror(errno)));
2923                         goto failed;
2924                 }
2925         }
2926
2927
2928         if (killtcp->fde == NULL) {
2929                 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
2930                                             EVENT_FD_READ,
2931                                             capture_tcp_handler, killtcp);
2932                 tevent_fd_set_auto_close(killtcp->fde);
2933
2934                 /* We also need to set up some events to tickle all these connections
2935                    until they are all reset
2936                 */
2937                 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
2938                                 ctdb_tickle_sentenced_connections, killtcp);
2939         }
2940
2941         /* tickle him once now */
2942         ctdb_sys_send_tcp(
2943                 &con->dst_addr,
2944                 &con->src_addr,
2945                 0, 0, 0);
2946
2947         return 0;
2948
2949 failed:
2950         talloc_free(vnn->killtcp);
2951         vnn->killtcp = NULL;
2952         return -1;
2953 }
2954
2955 /*
2956   kill a TCP connection.
2957  */
2958 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2959 {
2960         struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2961
2962         return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2963 }
2964
2965 /*
2966   called by a daemon to inform us of the entire list of TCP tickles for
2967   a particular public address.
2968   this control should only be sent by the node that is currently serving
2969   that public address.
2970  */
2971 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2972 {
2973         struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2974         struct ctdb_tcp_array *tcparray;
2975         struct ctdb_vnn *vnn;
2976
2977         /* We must at least have tickles.num or else we cant verify the size
2978            of the received data blob
2979          */
2980         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2981                                         tickles.connections)) {
2982                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2983                 return -1;
2984         }
2985
2986         /* verify that the size of data matches what we expect */
2987         if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
2988                                 tickles.connections)
2989                          + sizeof(struct ctdb_tcp_connection)
2990                                  * list->tickles.num) {
2991                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
2992                 return -1;
2993         }       
2994
2995         vnn = find_public_ip_vnn(ctdb, &list->addr);
2996         if (vnn == NULL) {
2997                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
2998                         ctdb_addr_to_str(&list->addr)));
2999
3000                 return 1;
3001         }
3002
3003         /* remove any old ticklelist we might have */
3004         talloc_free(vnn->tcp_array);
3005         vnn->tcp_array = NULL;
3006
3007         tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3008         CTDB_NO_MEMORY(ctdb, tcparray);
3009
3010         tcparray->num = list->tickles.num;
3011
3012         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3013         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3014
3015         memcpy(tcparray->connections, &list->tickles.connections[0], 
3016                sizeof(struct ctdb_tcp_connection)*tcparray->num);
3017
3018         /* We now have a new fresh tickle list array for this vnn */
3019         vnn->tcp_array = talloc_steal(vnn, tcparray);
3020         
3021         return 0;
3022 }
3023
3024 /*
3025   called to return the full list of tickles for the puclic address associated 
3026   with the provided vnn
3027  */
3028 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3029 {
3030         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3031         struct ctdb_control_tcp_tickle_list *list;
3032         struct ctdb_tcp_array *tcparray;
3033         int num;
3034         struct ctdb_vnn *vnn;
3035
3036         vnn = find_public_ip_vnn(ctdb, addr);
3037         if (vnn == NULL) {
3038                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3039                         ctdb_addr_to_str(addr)));
3040
3041                 return 1;
3042         }
3043
3044         tcparray = vnn->tcp_array;
3045         if (tcparray) {
3046                 num = tcparray->num;
3047         } else {
3048                 num = 0;
3049         }
3050
3051         outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3052                                 tickles.connections)
3053                         + sizeof(struct ctdb_tcp_connection) * num;
3054
3055         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3056         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3057         list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3058
3059         list->addr = *addr;
3060         list->tickles.num = num;
3061         if (num) {
3062                 memcpy(&list->tickles.connections[0], tcparray->connections, 
3063                         sizeof(struct ctdb_tcp_connection) * num);
3064         }
3065
3066         return 0;
3067 }
3068
3069
3070 /*
3071   set the list of all tcp tickles for a public address
3072  */
3073 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
3074                               struct timeval timeout, uint32_t destnode, 
3075                               ctdb_sock_addr *addr,
3076                               struct ctdb_tcp_array *tcparray)
3077 {
3078         int ret, num;
3079         TDB_DATA data;
3080         struct ctdb_control_tcp_tickle_list *list;
3081
3082         if (tcparray) {
3083                 num = tcparray->num;
3084         } else {
3085                 num = 0;
3086         }
3087
3088         data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
3089                                 tickles.connections) +
3090                         sizeof(struct ctdb_tcp_connection) * num;
3091         data.dptr = talloc_size(ctdb, data.dsize);
3092         CTDB_NO_MEMORY(ctdb, data.dptr);
3093
3094         list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3095         list->addr = *addr;
3096         list->tickles.num = num;
3097         if (tcparray) {
3098                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3099         }
3100
3101         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
3102                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3103                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3104         if (ret != 0) {
3105                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3106                 return -1;
3107         }
3108
3109         talloc_free(data.dptr);
3110
3111         return ret;
3112 }
3113
3114
3115 /*
3116   perform tickle updates if required
3117  */
3118 static void ctdb_update_tcp_tickles(struct event_context *ev, 
3119                                 struct timed_event *te, 
3120                                 struct timeval t, void *private_data)
3121 {
3122         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3123         int ret;
3124         struct ctdb_vnn *vnn;
3125
3126         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3127                 /* we only send out updates for public addresses that 
3128                    we have taken over
3129                  */
3130                 if (ctdb->pnn != vnn->pnn) {
3131                         continue;
3132                 }
3133                 /* We only send out the updates if we need to */
3134                 if (!vnn->tcp_update_needed) {
3135                         continue;
3136                 }
3137                 ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
3138                                 TAKEOVER_TIMEOUT(),
3139                                 CTDB_BROADCAST_CONNECTED,
3140                                 &vnn->public_address,
3141                                 vnn->tcp_array);
3142                 if (ret != 0) {
3143                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3144                                 ctdb_addr_to_str(&vnn->public_address)));
3145                 }
3146         }
3147
3148         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3149                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3150                              ctdb_update_tcp_tickles, ctdb);
3151 }               
3152         
3153
3154 /*
3155   start periodic update of tcp tickles
3156  */
3157 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3158 {
3159         ctdb->tickle_update_context = talloc_new(ctdb);
3160
3161         event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3162                              timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
3163                              ctdb_update_tcp_tickles, ctdb);
3164 }
3165
3166
3167
3168
3169 struct control_gratious_arp {
3170         struct ctdb_context *ctdb;
3171         ctdb_sock_addr addr;
3172         const char *iface;
3173         int count;
3174 };
3175
3176 /*
3177   send a control_gratuitous arp
3178  */
3179 static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
3180                                   struct timeval t, void *private_data)
3181 {
3182         int ret;
3183         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3184                                                         struct control_gratious_arp);
3185
3186         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3187         if (ret != 0) {
3188                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3189                                  arp->iface, strerror(errno)));
3190         }
3191
3192
3193         arp->count++;
3194         if (arp->count == CTDB_ARP_REPEAT) {
3195                 talloc_free(arp);
3196                 return;
3197         }
3198
3199         event_add_timed(arp->ctdb->ev, arp, 
3200                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
3201                         send_gratious_arp, arp);
3202 }
3203
3204
3205 /*
3206   send a gratious arp 
3207  */
3208 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3209 {
3210         struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3211         struct control_gratious_arp *arp;
3212
3213         /* verify the size of indata */
3214         if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3215                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3216                                  (unsigned)indata.dsize, 
3217                                  (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3218                 return -1;
3219         }
3220         if (indata.dsize != 
3221                 ( offsetof(struct ctdb_control_gratious_arp, iface)
3222                 + gratious_arp->len ) ){
3223
3224                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3225                         "but should be %u bytes\n", 
3226                          (unsigned)indata.dsize, 
3227                          (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3228                 return -1;
3229         }
3230
3231
3232         arp = talloc(ctdb, struct control_gratious_arp);
3233         CTDB_NO_MEMORY(ctdb, arp);
3234
3235         arp->ctdb  = ctdb;
3236         arp->addr   = gratious_arp->addr;
3237         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3238         CTDB_NO_MEMORY(ctdb, arp->iface);
3239         arp->count = 0;
3240         
3241         event_add_timed(arp->ctdb->ev, arp, 
3242                         timeval_zero(), send_gratious_arp, arp);
3243
3244         return 0;
3245 }
3246
3247 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3248 {
3249         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3250         int ret;
3251
3252         /* verify the size of indata */
3253         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3254                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3255                 return -1;
3256         }
3257         if (indata.dsize != 
3258                 ( offsetof(struct ctdb_control_ip_iface, iface)
3259                 + pub->len ) ){
3260
3261                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3262                         "but should be %u bytes\n", 
3263                          (unsigned)indata.dsize, 
3264                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3265                 return -1;
3266         }
3267
3268         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
3269
3270         if (ret != 0) {
3271                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3272                 return -1;
3273         }
3274
3275         return 0;
3276 }
3277
3278 /*
3279   called when releaseip event finishes for del_public_address
3280  */
3281 static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
3282                                 void *private_data)
3283 {
3284         talloc_free(private_data);
3285 }
3286
3287 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3288 {
3289         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3290         struct ctdb_vnn *vnn;
3291         int ret;
3292
3293         /* verify the size of indata */
3294         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3295                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3296                 return -1;
3297         }
3298         if (indata.dsize != 
3299                 ( offsetof(struct ctdb_control_ip_iface, iface)
3300                 + pub->len ) ){
3301
3302                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3303                         "but should be %u bytes\n", 
3304                          (unsigned)indata.dsize, 
3305                          (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3306                 return -1;
3307         }
3308
3309         /* walk over all public addresses until we find a match */
3310         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3311                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3312                         TALLOC_CTX *mem_ctx;
3313
3314                         DLIST_REMOVE(ctdb->vnn, vnn);
3315                         if (vnn->iface != NULL) {
3316                                 ctdb_vnn_unassign_iface(ctdb, vnn);
3317                         }
3318                         if (vnn->pnn != ctdb->pnn) {
3319                                 talloc_free(vnn);
3320                                 return 0;
3321                         }
3322
3323                         mem_ctx = talloc_new(ctdb);
3324                         talloc_steal(mem_ctx, vnn);
3325                         ret = ctdb_event_script_callback(ctdb, 
3326                                          mem_ctx, delete_ip_callback, mem_ctx,
3327                                          false,
3328                                          CTDB_EVENT_RELEASE_IP,
3329                                          "%s %s %u",
3330                                          ctdb_vnn_iface_string(vnn),
3331                                          ctdb_addr_to_str(&vnn->public_address),
3332                                          vnn->public_netmask_bits);
3333                         if (ret != 0) {
3334                                 return -1;
3335                         }
3336                         return 0;
3337                 }
3338         }
3339
3340         return -1;
3341 }
3342
3343 /* This function is called from the recovery daemon to verify that a remote
3344    node has the expected ip allocation.
3345    This is verified against ctdb->ip_tree
3346 */
3347 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
3348 {
3349         struct ctdb_public_ip_list *tmp_ip; 
3350         int i;
3351
3352         if (ctdb->ip_tree == NULL) {
3353                 /* dont know the expected allocation yet, assume remote node
3354                    is correct. */
3355                 return 0;
3356         }
3357
3358         if (ips == NULL) {
3359                 return 0;
3360         }
3361
3362         for (i=0; i<ips->num; i++) {
3363                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3364                 if (tmp_ip == NULL) {
3365                         DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
3366                         return -1;
3367                 }
3368
3369                 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3370                         continue;
3371                 }
3372
3373                 if (tmp_ip->pnn != ips->ips[i].pnn) {
3374                         DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
3375                         return -1;
3376                 }
3377         }
3378
3379         return 0;
3380 }
3381
3382 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3383 {
3384         struct ctdb_public_ip_list *tmp_ip; 
3385
3386         if (ctdb->ip_tree == NULL) {
3387                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3388                 return -1;
3389         }
3390
3391         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3392         if (tmp_ip == NULL) {
3393                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3394                 return -1;
3395         }
3396
3397         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3398         tmp_ip->pnn = ip->pnn;
3399
3400         return 0;
3401 }