ctdb-daemon: Replace an unsafe strcpy(3) call
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12    
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17    
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
34
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
37
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
43
44 #include "server/ipalloc.h"
45
/* Timeout value built by timeval_current_ofs() from the takeover_timeout
 * tunable.  The expansion references a variable named 'ctdb', so one must
 * be in scope wherever this macro is used. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* First argument handed to timeval_current_ofs() when
 * ctdb_control_send_arp() re-arms its timer (presumably seconds). */
#define CTDB_ARP_INTERVAL 1
/* ctdb_control_send_arp() stops re-arming itself once its per-announcement
 * counter reaches this value. */
#define CTDB_ARP_REPEAT   3
50
/*
  a local network interface that public addresses can be hosted on;
  entries are kept on the ctdb->ifaces list
 */
struct ctdb_interface {
        /* list linkage, maintained with DLIST_ADD/DLIST_REMOVE */
        struct ctdb_interface *prev, *next;
        /* interface name; allocated as a talloc child of this struct */
        const char *name;
        /* set to true when the interface is created */
        bool link_up;
        /* bumped when a vnn is assigned to this interface and dropped
           again when it is unassigned */
        uint32_t references;
};
57
58 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
59 {
60         if (vnn->iface) {
61                 return vnn->iface->name;
62         }
63
64         return "__none__";
65 }
66
67 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
68 {
69         struct ctdb_interface *i;
70
71         if (strlen(iface) > CTDB_IFACE_SIZE) {
72                 DEBUG(DEBUG_ERR, ("Interface name too long \"%s\"\n", iface));
73                 return -1;
74         }
75
76         /* Verify that we don't have an entry for this ip yet */
77         for (i=ctdb->ifaces;i;i=i->next) {
78                 if (strcmp(i->name, iface) == 0) {
79                         return 0;
80                 }
81         }
82
83         /* create a new structure for this interface */
84         i = talloc_zero(ctdb, struct ctdb_interface);
85         CTDB_NO_MEMORY_FATAL(ctdb, i);
86         i->name = talloc_strdup(i, iface);
87         CTDB_NO_MEMORY(ctdb, i->name);
88
89         i->link_up = true;
90
91         DLIST_ADD(ctdb->ifaces, i);
92
93         return 0;
94 }
95
96 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
97                                         const char *name)
98 {
99         int n;
100
101         for (n = 0; vnn->ifaces[n] != NULL; n++) {
102                 if (strcmp(name, vnn->ifaces[n]) == 0) {
103                         return true;
104                 }
105         }
106
107         return false;
108 }
109
110 /* If any interfaces now have no possible IPs then delete them.  This
111  * implementation is naive (i.e. simple) rather than clever
112  * (i.e. complex).  Given that this is run on delip and that operation
113  * is rare, this doesn't need to be efficient - it needs to be
114  * foolproof.  One alternative is reference counting, where the logic
115  * is distributed and can, therefore, be broken in multiple places.
116  * Another alternative is to build a red-black tree of interfaces that
117  * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
118  * once) and then walking ctdb->ifaces once and deleting those not in
119  * the tree.  Let's go to one of those if the naive implementation
120  * causes problems...  :-)
121  */
122 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
123                                         struct ctdb_vnn *vnn)
124 {
125         struct ctdb_interface *i, *next;
126
127         /* For each interface, check if there's an IP using it. */
128         for (i = ctdb->ifaces; i != NULL; i = next) {
129                 struct ctdb_vnn *tv;
130                 bool found;
131                 next = i->next;
132
133                 /* Only consider interfaces named in the given VNN. */
134                 if (!vnn_has_interface_with_name(vnn, i->name)) {
135                         continue;
136                 }
137
138                 /* Is the "single IP" on this interface? */
139                 if ((ctdb->single_ip_vnn != NULL) &&
140                     (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
141                     (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
142                         /* Found, next interface please... */
143                         continue;
144                 }
145                 /* Search for a vnn with this interface. */
146                 found = false;
147                 for (tv=ctdb->vnn; tv; tv=tv->next) {
148                         if (vnn_has_interface_with_name(tv, i->name)) {
149                                 found = true;
150                                 break;
151                         }
152                 }
153
154                 if (!found) {
155                         /* None of the VNNs are using this interface. */
156                         DLIST_REMOVE(ctdb->ifaces, i);
157                         talloc_free(i);
158                 }
159         }
160 }
161
162
163 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
164                                               const char *iface)
165 {
166         struct ctdb_interface *i;
167
168         for (i=ctdb->ifaces;i;i=i->next) {
169                 if (strcmp(i->name, iface) == 0) {
170                         return i;
171                 }
172         }
173
174         return NULL;
175 }
176
177 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
178                                                   struct ctdb_vnn *vnn)
179 {
180         int i;
181         struct ctdb_interface *cur = NULL;
182         struct ctdb_interface *best = NULL;
183
184         for (i=0; vnn->ifaces[i]; i++) {
185
186                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
187                 if (cur == NULL) {
188                         continue;
189                 }
190
191                 if (!cur->link_up) {
192                         continue;
193                 }
194
195                 if (best == NULL) {
196                         best = cur;
197                         continue;
198                 }
199
200                 if (cur->references < best->references) {
201                         best = cur;
202                         continue;
203                 }
204         }
205
206         return best;
207 }
208
209 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
210                                      struct ctdb_vnn *vnn)
211 {
212         struct ctdb_interface *best = NULL;
213
214         if (vnn->iface) {
215                 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
216                                    "still assigned to iface '%s'\n",
217                                    ctdb_addr_to_str(&vnn->public_address),
218                                    ctdb_vnn_iface_string(vnn)));
219                 return 0;
220         }
221
222         best = ctdb_vnn_best_iface(ctdb, vnn);
223         if (best == NULL) {
224                 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
225                                   "cannot assign to iface any iface\n",
226                                   ctdb_addr_to_str(&vnn->public_address)));
227                 return -1;
228         }
229
230         vnn->iface = best;
231         best->references++;
232         vnn->pnn = ctdb->pnn;
233
234         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235                            "now assigned to iface '%s' refs[%d]\n",
236                            ctdb_addr_to_str(&vnn->public_address),
237                            ctdb_vnn_iface_string(vnn),
238                            best->references));
239         return 0;
240 }
241
242 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
243                                     struct ctdb_vnn *vnn)
244 {
245         DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
246                            "now unassigned (old iface '%s' refs[%d])\n",
247                            ctdb_addr_to_str(&vnn->public_address),
248                            ctdb_vnn_iface_string(vnn),
249                            vnn->iface?vnn->iface->references:0));
250         if (vnn->iface) {
251                 vnn->iface->references--;
252         }
253         vnn->iface = NULL;
254         if (vnn->pnn == ctdb->pnn) {
255                 vnn->pnn = -1;
256         }
257 }
258
259 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
260                                struct ctdb_vnn *vnn)
261 {
262         int i;
263
264         /* Nodes that are not RUNNING can not host IPs */
265         if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
266                 return false;
267         }
268
269         if (vnn->delete_pending) {
270                 return false;
271         }
272
273         if (vnn->iface && vnn->iface->link_up) {
274                 return true;
275         }
276
277         for (i=0; vnn->ifaces[i]; i++) {
278                 struct ctdb_interface *cur;
279
280                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
281                 if (cur == NULL) {
282                         continue;
283                 }
284
285                 if (cur->link_up) {
286                         return true;
287                 }
288         }
289
290         return false;
291 }
292
/*
  state for one round of gratuitous arp / tcp tickle announcements,
  driven by ctdb_control_send_arp()
 */
struct ctdb_takeover_arp {
        struct ctdb_context *ctdb;
        /* number of announcements sent so far; compared against
           CTDB_ARP_REPEAT */
        uint32_t count;
        /* address being announced (copied from the vnn's public address) */
        ctdb_sock_addr addr;
        /* connections to send tickle acks for; taken over from
           vnn->tcp_array, may be NULL */
        struct ctdb_tcp_array *tcparray;
        /* vnn the announcement is made for */
        struct ctdb_vnn *vnn;
};
300
301
/*
  lists of tcp endpoints
 */
struct ctdb_tcp_list {
        /* list linkage (presumably for the DLIST_* macros - users of this
           struct are not visible here) */
        struct ctdb_tcp_list *prev, *next;
        /* the tcp connection this entry describes */
        struct ctdb_connection connection;
};
309
/*
  list of clients to kill on IP release
 */
struct ctdb_client_ip {
        /* list linkage (presumably for the DLIST_* macros - users of this
           struct are not visible here) */
        struct ctdb_client_ip *prev, *next;
        struct ctdb_context *ctdb;
        /* address associated with the client */
        ctdb_sock_addr addr;
        /* identifier of the client */
        uint32_t client_id;
};
319
320
321 /*
322   send a gratuitous arp
323  */
324 static void ctdb_control_send_arp(struct tevent_context *ev,
325                                   struct tevent_timer *te,
326                                   struct timeval t, void *private_data)
327 {
328         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
329                                                         struct ctdb_takeover_arp);
330         int i, ret;
331         struct ctdb_tcp_array *tcparray;
332         const char *iface = ctdb_vnn_iface_string(arp->vnn);
333
334         ret = ctdb_sys_send_arp(&arp->addr, iface);
335         if (ret != 0) {
336                 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
337                                   iface, strerror(errno)));
338         }
339
340         tcparray = arp->tcparray;
341         if (tcparray) {
342                 for (i=0;i<tcparray->num;i++) {
343                         struct ctdb_connection *tcon;
344
345                         tcon = &tcparray->connections[i];
346                         DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
347                                 (unsigned)ntohs(tcon->dst.ip.sin_port),
348                                 ctdb_addr_to_str(&tcon->src),
349                                 (unsigned)ntohs(tcon->src.ip.sin_port)));
350                         ret = ctdb_sys_send_tcp(
351                                 &tcon->src,
352                                 &tcon->dst,
353                                 0, 0, 0);
354                         if (ret != 0) {
355                                 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
356                                         ctdb_addr_to_str(&tcon->src)));
357                         }
358                 }
359         }
360
361         arp->count++;
362
363         if (arp->count == CTDB_ARP_REPEAT) {
364                 talloc_free(arp);
365                 return;
366         }
367
368         tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
369                          timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
370                          ctdb_control_send_arp, arp);
371 }
372
373 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
374                                        struct ctdb_vnn *vnn)
375 {
376         struct ctdb_takeover_arp *arp;
377         struct ctdb_tcp_array *tcparray;
378
379         if (!vnn->takeover_ctx) {
380                 vnn->takeover_ctx = talloc_new(vnn);
381                 if (!vnn->takeover_ctx) {
382                         return -1;
383                 }
384         }
385
386         arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
387         if (!arp) {
388                 return -1;
389         }
390
391         arp->ctdb = ctdb;
392         arp->addr = vnn->public_address;
393         arp->vnn  = vnn;
394
395         tcparray = vnn->tcp_array;
396         if (tcparray) {
397                 /* add all of the known tcp connections for this IP to the
398                    list of tcp connections to send tickle acks for */
399                 arp->tcparray = talloc_steal(arp, tcparray);
400
401                 vnn->tcp_array = NULL;
402                 vnn->tcp_update_needed = true;
403         }
404
405         tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
406                          timeval_zero(), ctdb_control_send_arp, arp);
407
408         return 0;
409 }
410
/*
  state handed to release_ip_callback()
 */
struct takeover_callback_state {
        /* control request that is answered from the callback */
        struct ctdb_req_control_old *c;
        /* address being released */
        ctdb_sock_addr *addr;
        /* vnn for that address; the callback sets this to NULL once the
           vnn has been deleted */
        struct ctdb_vnn *vnn;
};
416
/*
  state handed to ctdb_do_takeip_callback()
 */
struct ctdb_do_takeip_state {
        /* control request that is answered from the callback */
        struct ctdb_req_control_old *c;
        /* vnn being taken over */
        struct ctdb_vnn *vnn;
};
421
422 /*
423   called when takeip event finishes
424  */
425 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
426                                     void *private_data)
427 {
428         struct ctdb_do_takeip_state *state =
429                 talloc_get_type(private_data, struct ctdb_do_takeip_state);
430         int32_t ret;
431         TDB_DATA data;
432
433         if (status != 0) {
434                 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
435         
436                 if (status == -ETIME) {
437                         ctdb_ban_self(ctdb);
438                 }
439                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
440                                  ctdb_addr_to_str(&state->vnn->public_address),
441                                  ctdb_vnn_iface_string(state->vnn)));
442                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
443
444                 node->flags |= NODE_FLAGS_UNHEALTHY;
445                 talloc_free(state);
446                 return;
447         }
448
449         if (ctdb->do_checkpublicip) {
450
451         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
452         if (ret != 0) {
453                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
454                 talloc_free(state);
455                 return;
456         }
457
458         }
459
460         data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
461         data.dsize = strlen((char *)data.dptr) + 1;
462         DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
463
464         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
465
466
467         /* the control succeeded */
468         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
469         talloc_free(state);
470         return;
471 }
472
473 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
474 {
475         state->vnn->update_in_flight = false;
476         return 0;
477 }
478
479 /*
480   take over an ip address
481  */
482 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
483                               struct ctdb_req_control_old *c,
484                               struct ctdb_vnn *vnn)
485 {
486         int ret;
487         struct ctdb_do_takeip_state *state;
488
489         if (vnn->update_in_flight) {
490                 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
491                                     "update for this IP already in flight\n",
492                                     ctdb_addr_to_str(&vnn->public_address),
493                                     vnn->public_netmask_bits));
494                 return -1;
495         }
496
497         ret = ctdb_vnn_assign_iface(ctdb, vnn);
498         if (ret != 0) {
499                 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
500                                  "assign a usable interface\n",
501                                  ctdb_addr_to_str(&vnn->public_address),
502                                  vnn->public_netmask_bits));
503                 return -1;
504         }
505
506         state = talloc(vnn, struct ctdb_do_takeip_state);
507         CTDB_NO_MEMORY(ctdb, state);
508
509         state->c = talloc_steal(ctdb, c);
510         state->vnn   = vnn;
511
512         vnn->update_in_flight = true;
513         talloc_set_destructor(state, ctdb_takeip_destructor);
514
515         DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
516                             ctdb_addr_to_str(&vnn->public_address),
517                             vnn->public_netmask_bits,
518                             ctdb_vnn_iface_string(vnn)));
519
520         ret = ctdb_event_script_callback(ctdb,
521                                          state,
522                                          ctdb_do_takeip_callback,
523                                          state,
524                                          CTDB_EVENT_TAKE_IP,
525                                          "%s %s %u",
526                                          ctdb_vnn_iface_string(vnn),
527                                          ctdb_addr_to_str(&vnn->public_address),
528                                          vnn->public_netmask_bits);
529
530         if (ret != 0) {
531                 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
532                         ctdb_addr_to_str(&vnn->public_address),
533                         ctdb_vnn_iface_string(vnn)));
534                 talloc_free(state);
535                 return -1;
536         }
537
538         return 0;
539 }
540
/*
  state handed to ctdb_do_updateip_callback()
 */
struct ctdb_do_updateip_state {
        /* control request that is answered from the callback */
        struct ctdb_req_control_old *c;
        /* interface the address was on before the move; restored if the
           event fails */
        struct ctdb_interface *old;
        /* vnn being moved */
        struct ctdb_vnn *vnn;
};
546
547 /*
548   called when updateip event finishes
549  */
550 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
551                                       void *private_data)
552 {
553         struct ctdb_do_updateip_state *state =
554                 talloc_get_type(private_data, struct ctdb_do_updateip_state);
555         int32_t ret;
556
557         if (status != 0) {
558                 if (status == -ETIME) {
559                         ctdb_ban_self(ctdb);
560                 }
561                 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
562                         ctdb_addr_to_str(&state->vnn->public_address),
563                         state->old->name,
564                         ctdb_vnn_iface_string(state->vnn)));
565
566                 /*
567                  * All we can do is reset the old interface
568                  * and let the next run fix it
569                  */
570                 ctdb_vnn_unassign_iface(ctdb, state->vnn);
571                 state->vnn->iface = state->old;
572                 state->vnn->iface->references++;
573
574                 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
575                 talloc_free(state);
576                 return;
577         }
578
579         if (ctdb->do_checkpublicip) {
580
581         ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
582         if (ret != 0) {
583                 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
584                 talloc_free(state);
585                 return;
586         }
587
588         }
589
590         /* the control succeeded */
591         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
592         talloc_free(state);
593         return;
594 }
595
596 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
597 {
598         state->vnn->update_in_flight = false;
599         return 0;
600 }
601
602 /*
603   update (move) an ip address
604  */
605 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
606                                 struct ctdb_req_control_old *c,
607                                 struct ctdb_vnn *vnn)
608 {
609         int ret;
610         struct ctdb_do_updateip_state *state;
611         struct ctdb_interface *old = vnn->iface;
612         const char *new_name;
613
614         if (vnn->update_in_flight) {
615                 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
616                                     "update for this IP already in flight\n",
617                                     ctdb_addr_to_str(&vnn->public_address),
618                                     vnn->public_netmask_bits));
619                 return -1;
620         }
621
622         ctdb_vnn_unassign_iface(ctdb, vnn);
623         ret = ctdb_vnn_assign_iface(ctdb, vnn);
624         if (ret != 0) {
625                 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
626                                  "assin a usable interface (old iface '%s')\n",
627                                  ctdb_addr_to_str(&vnn->public_address),
628                                  vnn->public_netmask_bits,
629                                  old->name));
630                 return -1;
631         }
632
633         new_name = ctdb_vnn_iface_string(vnn);
634         if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
635                 /* A benign update from one interface onto itself.
636                  * no need to run the eventscripts in this case, just return
637                  * success.
638                  */
639                 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
640                 return 0;
641         }
642
643         state = talloc(vnn, struct ctdb_do_updateip_state);
644         CTDB_NO_MEMORY(ctdb, state);
645
646         state->c = talloc_steal(ctdb, c);
647         state->old = old;
648         state->vnn = vnn;
649
650         vnn->update_in_flight = true;
651         talloc_set_destructor(state, ctdb_updateip_destructor);
652
653         DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
654                             "interface %s to %s\n",
655                             ctdb_addr_to_str(&vnn->public_address),
656                             vnn->public_netmask_bits,
657                             old->name,
658                             new_name));
659
660         ret = ctdb_event_script_callback(ctdb,
661                                          state,
662                                          ctdb_do_updateip_callback,
663                                          state,
664                                          CTDB_EVENT_UPDATE_IP,
665                                          "%s %s %s %u",
666                                          state->old->name,
667                                          new_name,
668                                          ctdb_addr_to_str(&vnn->public_address),
669                                          vnn->public_netmask_bits);
670         if (ret != 0) {
671                 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
672                                  ctdb_addr_to_str(&vnn->public_address),
673                                  old->name, new_name));
674                 talloc_free(state);
675                 return -1;
676         }
677
678         return 0;
679 }
680
681 /*
682   Find the vnn of the node that has a public ip address
683   returns -1 if the address is not known as a public address
684  */
685 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
686 {
687         struct ctdb_vnn *vnn;
688
689         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
690                 if (ctdb_same_ip(&vnn->public_address, addr)) {
691                         return vnn;
692                 }
693         }
694
695         return NULL;
696 }
697
698 /*
699   take over an ip address
700  */
701 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
702                                  struct ctdb_req_control_old *c,
703                                  TDB_DATA indata,
704                                  bool *async_reply)
705 {
706         int ret;
707         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
708         struct ctdb_vnn *vnn;
709         bool have_ip = false;
710         bool do_updateip = false;
711         bool do_takeip = false;
712         struct ctdb_interface *best_iface = NULL;
713
714         if (pip->pnn != ctdb->pnn) {
715                 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
716                                  "with pnn %d, but we're node %d\n",
717                                  ctdb_addr_to_str(&pip->addr),
718                                  pip->pnn, ctdb->pnn));
719                 return -1;
720         }
721
722         /* update out vnn list */
723         vnn = find_public_ip_vnn(ctdb, &pip->addr);
724         if (vnn == NULL) {
725                 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
726                         ctdb_addr_to_str(&pip->addr)));
727                 return 0;
728         }
729
730         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
731                 have_ip = ctdb_sys_have_ip(&pip->addr);
732         }
733         best_iface = ctdb_vnn_best_iface(ctdb, vnn);
734         if (best_iface == NULL) {
735                 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
736                                  "a usable interface (old %s, have_ip %d)\n",
737                                  ctdb_addr_to_str(&vnn->public_address),
738                                  vnn->public_netmask_bits,
739                                  ctdb_vnn_iface_string(vnn),
740                                  have_ip));
741                 return -1;
742         }
743
744         if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
745                 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
746                 have_ip = false;
747         }
748
749
750         if (vnn->iface == NULL && have_ip) {
751                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752                                   "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
753                                  ctdb_addr_to_str(&vnn->public_address)));
754                 return 0;
755         }
756
757         if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
758                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
759                                   "and we have it on iface[%s], but it was assigned to node %d"
760                                   "and we are node %d, banning ourself\n",
761                                  ctdb_addr_to_str(&vnn->public_address),
762                                  ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
763                 ctdb_ban_self(ctdb);
764                 return -1;
765         }
766
767         if (vnn->pnn == -1 && have_ip) {
768                 vnn->pnn = ctdb->pnn;
769                 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
770                                   "and we already have it on iface[%s], update local daemon\n",
771                                  ctdb_addr_to_str(&vnn->public_address),
772                                   ctdb_vnn_iface_string(vnn)));
773                 return 0;
774         }
775
776         if (vnn->iface) {
777                 if (vnn->iface != best_iface) {
778                         if (!vnn->iface->link_up) {
779                                 do_updateip = true;
780                         } else if (vnn->iface->references > (best_iface->references + 1)) {
781                                 /* only move when the rebalance gains something */
782                                         do_updateip = true;
783                         }
784                 }
785         }
786
787         if (!have_ip) {
788                 if (do_updateip) {
789                         ctdb_vnn_unassign_iface(ctdb, vnn);
790                         do_updateip = false;
791                 }
792                 do_takeip = true;
793         }
794
795         if (do_takeip) {
796                 ret = ctdb_do_takeip(ctdb, c, vnn);
797                 if (ret != 0) {
798                         return -1;
799                 }
800         } else if (do_updateip) {
801                 ret = ctdb_do_updateip(ctdb, c, vnn);
802                 if (ret != 0) {
803                         return -1;
804                 }
805         } else {
806                 /*
807                  * The interface is up and the kernel known the ip
808                  * => do nothing
809                  */
810                 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
811                         ctdb_addr_to_str(&pip->addr),
812                         vnn->public_netmask_bits,
813                         ctdb_vnn_iface_string(vnn)));
814                 return 0;
815         }
816
817         /* tell ctdb_control.c that we will be replying asynchronously */
818         *async_reply = true;
819
820         return 0;
821 }
822
/*
  remove a public address from this node: unlink the vnn from
  ctdb->vnn, release its interface, drop interfaces that no remaining
  vnn names, and free the vnn
 */
static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
{
        /* unlink first so the orphan check below does not count this vnn */
        DLIST_REMOVE(ctdb->vnn, vnn);
        ctdb_vnn_unassign_iface(ctdb, vnn);
        /* still needs the vnn's interface names, so it runs before the free */
        ctdb_remove_orphaned_ifaces(ctdb, vnn);
        talloc_free(vnn);
}
830
831 /*
832   called when releaseip event finishes
833  */
834 static void release_ip_callback(struct ctdb_context *ctdb, int status, 
835                                 void *private_data)
836 {
837         struct takeover_callback_state *state = 
838                 talloc_get_type(private_data, struct takeover_callback_state);
839         TDB_DATA data;
840
841         if (status == -ETIME) {
842                 ctdb_ban_self(ctdb);
843         }
844
845         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
846                 if  (ctdb_sys_have_ip(state->addr)) {
847                         DEBUG(DEBUG_ERR,
848                               ("IP %s still hosted during release IP callback, failing\n",
849                                ctdb_addr_to_str(state->addr)));
850                         ctdb_request_control_reply(ctdb, state->c,
851                                                    NULL, -1, NULL);
852                         talloc_free(state);
853                         return;
854                 }
855         }
856
857         /* send a message to all clients of this node telling them
858            that the cluster has been reconfigured and they should
859            release any sockets on this IP */
860         data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
861         CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
862         data.dsize = strlen((char *)data.dptr)+1;
863
864         DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
865
866         ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
867
868         ctdb_vnn_unassign_iface(ctdb, state->vnn);
869
870         /* Process the IP if it has been marked for deletion */
871         if (state->vnn->delete_pending) {
872                 do_delete_ip(ctdb, state->vnn);
873                 state->vnn = NULL;
874         }
875
876         /* the control succeeded */
877         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
878         talloc_free(state);
879 }
880
881 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
882 {
883         if (state->vnn != NULL) {
884                 state->vnn->update_in_flight = false;
885         }
886         return 0;
887 }
888
889 /*
890   release an ip address
891  */
892 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
893                                 struct ctdb_req_control_old *c,
894                                 TDB_DATA indata, 
895                                 bool *async_reply)
896 {
897         int ret;
898         struct takeover_callback_state *state;
899         struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
900         struct ctdb_vnn *vnn;
901         char *iface;
902
903         /* update our vnn list */
904         vnn = find_public_ip_vnn(ctdb, &pip->addr);
905         if (vnn == NULL) {
906                 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
907                         ctdb_addr_to_str(&pip->addr)));
908                 return 0;
909         }
910         vnn->pnn = pip->pnn;
911
912         /* stop any previous arps */
913         talloc_free(vnn->takeover_ctx);
914         vnn->takeover_ctx = NULL;
915
916         /* Some ctdb tool commands (e.g. moveip) send
917          * lazy multicast to drop an IP from any node that isn't the
918          * intended new node.  The following causes makes ctdbd ignore
919          * a release for any address it doesn't host.
920          */
921         if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
922                 if (!ctdb_sys_have_ip(&pip->addr)) {
923                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
924                                 ctdb_addr_to_str(&pip->addr),
925                                 vnn->public_netmask_bits,
926                                 ctdb_vnn_iface_string(vnn)));
927                         ctdb_vnn_unassign_iface(ctdb, vnn);
928                         return 0;
929                 }
930         } else {
931                 if (vnn->iface == NULL) {
932                         DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
933                                            ctdb_addr_to_str(&pip->addr),
934                                            vnn->public_netmask_bits));
935                         return 0;
936                 }
937         }
938
939         /* There is a potential race between take_ip and us because we
940          * update the VNN via a callback that run when the
941          * eventscripts have been run.  Avoid the race by allowing one
942          * update to be in flight at a time.
943          */
944         if (vnn->update_in_flight) {
945                 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
946                                     "update for this IP already in flight\n",
947                                     ctdb_addr_to_str(&vnn->public_address),
948                                     vnn->public_netmask_bits));
949                 return -1;
950         }
951
952         iface = strdup(ctdb_vnn_iface_string(vnn));
953
954         DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
955                 ctdb_addr_to_str(&pip->addr),
956                 vnn->public_netmask_bits,
957                 iface,
958                 pip->pnn));
959
960         state = talloc(ctdb, struct takeover_callback_state);
961         if (state == NULL) {
962                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
963                                __FILE__, __LINE__);
964                 free(iface);
965                 return -1;
966         }
967
968         state->c = talloc_steal(state, c);
969         state->addr = talloc(state, ctdb_sock_addr);       
970         if (state->addr == NULL) {
971                 ctdb_set_error(ctdb, "Out of memory at %s:%d",
972                                __FILE__, __LINE__);
973                 free(iface);
974                 talloc_free(state);
975                 return -1;
976         }
977         *state->addr = pip->addr;
978         state->vnn   = vnn;
979
980         vnn->update_in_flight = true;
981         talloc_set_destructor(state, ctdb_releaseip_destructor);
982
983         ret = ctdb_event_script_callback(ctdb, 
984                                          state, release_ip_callback, state,
985                                          CTDB_EVENT_RELEASE_IP,
986                                          "%s %s %u",
987                                          iface,
988                                          ctdb_addr_to_str(&pip->addr),
989                                          vnn->public_netmask_bits);
990         free(iface);
991         if (ret != 0) {
992                 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
993                         ctdb_addr_to_str(&pip->addr),
994                         ctdb_vnn_iface_string(vnn)));
995                 talloc_free(state);
996                 return -1;
997         }
998
999         /* tell the control that we will be reply asynchronously */
1000         *async_reply = true;
1001         return 0;
1002 }
1003
1004 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1005                                    ctdb_sock_addr *addr,
1006                                    unsigned mask, const char *ifaces,
1007                                    bool check_address)
1008 {
1009         struct ctdb_vnn      *vnn;
1010         uint32_t num = 0;
1011         char *tmp;
1012         const char *iface;
1013         int i;
1014         int ret;
1015
1016         tmp = strdup(ifaces);
1017         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1018                 if (!ctdb_sys_check_iface_exists(iface)) {
1019                         DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1020                         free(tmp);
1021                         return -1;
1022                 }
1023         }
1024         free(tmp);
1025
1026         /* Verify that we don't have an entry for this ip yet */
1027         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1028                 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1029                         DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n", 
1030                                 ctdb_addr_to_str(addr)));
1031                         return -1;
1032                 }               
1033         }
1034
1035         /* create a new vnn structure for this ip address */
1036         vnn = talloc_zero(ctdb, struct ctdb_vnn);
1037         CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1038         vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1039         tmp = talloc_strdup(vnn, ifaces);
1040         CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1041         for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1042                 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1043                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1044                 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1045                 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1046                 num++;
1047         }
1048         talloc_free(tmp);
1049         vnn->ifaces[num] = NULL;
1050         vnn->public_address      = *addr;
1051         vnn->public_netmask_bits = mask;
1052         vnn->pnn                 = -1;
1053         if (check_address) {
1054                 if (ctdb_sys_have_ip(addr)) {
1055                         DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1056                         vnn->pnn = ctdb->pnn;
1057                 }
1058         }
1059
1060         for (i=0; vnn->ifaces[i]; i++) {
1061                 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1062                 if (ret != 0) {
1063                         DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1064                                            "for public_address[%s]\n",
1065                                            vnn->ifaces[i], ctdb_addr_to_str(addr)));
1066                         talloc_free(vnn);
1067                         return -1;
1068                 }
1069         }
1070
1071         DLIST_ADD(ctdb->vnn, vnn);
1072
1073         return 0;
1074 }
1075
1076 /*
1077   setup the public address lists from a file
1078 */
1079 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1080 {
1081         char **lines;
1082         int nlines;
1083         int i;
1084
1085         lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1086         if (lines == NULL) {
1087                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1088                 return -1;
1089         }
1090         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1091                 nlines--;
1092         }
1093
1094         for (i=0;i<nlines;i++) {
1095                 unsigned mask;
1096                 ctdb_sock_addr addr;
1097                 const char *addrstr;
1098                 const char *ifaces;
1099                 char *tok, *line;
1100
1101                 line = lines[i];
1102                 while ((*line == ' ') || (*line == '\t')) {
1103                         line++;
1104                 }
1105                 if (*line == '#') {
1106                         continue;
1107                 }
1108                 if (strcmp(line, "") == 0) {
1109                         continue;
1110                 }
1111                 tok = strtok(line, " \t");
1112                 addrstr = tok;
1113                 tok = strtok(NULL, " \t");
1114                 if (tok == NULL) {
1115                         if (NULL == ctdb->default_public_interface) {
1116                                 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1117                                          i+1));
1118                                 talloc_free(lines);
1119                                 return -1;
1120                         }
1121                         ifaces = ctdb->default_public_interface;
1122                 } else {
1123                         ifaces = tok;
1124                 }
1125
1126                 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1127                         DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1128                         talloc_free(lines);
1129                         return -1;
1130                 }
1131                 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1132                         DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1133                         talloc_free(lines);
1134                         return -1;
1135                 }
1136         }
1137
1138
1139         talloc_free(lines);
1140         return 0;
1141 }
1142
1143 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1144                               const char *iface,
1145                               const char *ip)
1146 {
1147         struct ctdb_vnn *svnn;
1148         struct ctdb_interface *cur = NULL;
1149         bool ok;
1150         int ret;
1151
1152         svnn = talloc_zero(ctdb, struct ctdb_vnn);
1153         CTDB_NO_MEMORY(ctdb, svnn);
1154
1155         svnn->ifaces = talloc_array(svnn, const char *, 2);
1156         CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1157         svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1158         CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1159         svnn->ifaces[1] = NULL;
1160
1161         ok = parse_ip(ip, iface, 0, &svnn->public_address);
1162         if (!ok) {
1163                 talloc_free(svnn);
1164                 return -1;
1165         }
1166
1167         ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1168         if (ret != 0) {
1169                 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1170                                    "for single_ip[%s]\n",
1171                                    svnn->ifaces[0],
1172                                    ctdb_addr_to_str(&svnn->public_address)));
1173                 talloc_free(svnn);
1174                 return -1;
1175         }
1176
1177         /* assume the single public ip interface is initially "good" */
1178         cur = ctdb_find_iface(ctdb, iface);
1179         if (cur == NULL) {
1180                 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1181                 return -1;
1182         }
1183         cur->link_up = true;
1184
1185         ret = ctdb_vnn_assign_iface(ctdb, svnn);
1186         if (ret != 0) {
1187                 talloc_free(svnn);
1188                 return -1;
1189         }
1190
1191         ctdb->single_ip_vnn = svnn;
1192         return 0;
1193 }
1194
1195 static void *add_ip_callback(void *parm, void *data)
1196 {
1197         struct public_ip_list *this_ip = parm;
1198         struct public_ip_list *prev_ip = data;
1199
1200         if (prev_ip == NULL) {
1201                 return parm;
1202         }
1203         if (this_ip->pnn == -1) {
1204                 this_ip->pnn = prev_ip->pnn;
1205         }
1206
1207         return parm;
1208 }
1209
1210 static int getips_count_callback(void *param, void *data)
1211 {
1212         struct public_ip_list **ip_list = (struct public_ip_list **)param;
1213         struct public_ip_list *new_ip = (struct public_ip_list *)data;
1214
1215         new_ip->next = *ip_list;
1216         *ip_list     = new_ip;
1217         return 0;
1218 }
1219
1220 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1221                                        struct ctdb_public_ip_list *ips,
1222                                        uint32_t pnn);
1223
1224 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1225                                          struct ipalloc_state *ipalloc_state,
1226                                          struct ctdb_node_map_old *nodemap)
1227 {
1228         int j;
1229         int ret;
1230         struct ctdb_public_ip_list_old *ip_list;
1231
1232         if (ipalloc_state->num != nodemap->num) {
1233                 DEBUG(DEBUG_ERR,
1234                       (__location__
1235                        " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1236                        ipalloc_state->num, nodemap->num));
1237                 return -1;
1238         }
1239
1240         for (j=0; j<nodemap->num; j++) {
1241                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1242                         continue;
1243                 }
1244
1245                 /* Retrieve the list of known public IPs from the node */
1246                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1247                                         TAKEOVER_TIMEOUT(),
1248                                         j,
1249                                         ipalloc_state->known_public_ips,
1250                                         0,
1251                                         &ip_list);
1252                 if (ret != 0) {
1253                         DEBUG(DEBUG_ERR,
1254                               ("Failed to read known public IPs from node: %u\n",
1255                                j));
1256                         return -1;
1257                 }
1258                 ipalloc_state->known_public_ips[j].num = ip_list->num;
1259                 /* This could be copied and freed.  However, ip_list
1260                  * is allocated off ipalloc_state->known_public_ips,
1261                  * so this is a safe hack.  This will go away in a
1262                  * while anyway... */
1263                 ipalloc_state->known_public_ips[j].ip = &ip_list->ips[0];
1264
1265                 if (ctdb->do_checkpublicip) {
1266                         verify_remote_ip_allocation(
1267                                 ctdb,
1268                                 &ipalloc_state->known_public_ips[j],
1269                                 j);
1270                 }
1271
1272                 /* Retrieve the list of available public IPs from the node */
1273                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1274                                         TAKEOVER_TIMEOUT(),
1275                                         j,
1276                                         ipalloc_state->available_public_ips,
1277                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1278                                         &ip_list);
1279                 if (ret != 0) {
1280                         DEBUG(DEBUG_ERR,
1281                               ("Failed to read available public IPs from node: %u\n",
1282                                j));
1283                         return -1;
1284                 }
1285                 ipalloc_state->available_public_ips[j].num = ip_list->num;
1286                 /* This could be copied and freed.  However, ip_list
1287                  * is allocated off ipalloc_state->available_public_ips,
1288                  * so this is a safe hack.  This will go away in a
1289                  * while anyway... */
1290                 ipalloc_state->available_public_ips[j].ip = &ip_list->ips[0];
1291         }
1292
1293         return 0;
1294 }
1295
1296 static struct public_ip_list *
1297 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1298 {
1299         int i, j;
1300         struct public_ip_list *ip_list;
1301         struct ctdb_public_ip_list *public_ips;
1302
1303         TALLOC_FREE(ctdb->ip_tree);
1304         ctdb->ip_tree = trbt_create(ctdb, 0);
1305
1306         for (i=0; i < ctdb->num_nodes; i++) {
1307
1308                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1309                         continue;
1310                 }
1311
1312                 /* there were no public ips for this node */
1313                 if (ipalloc_state->known_public_ips == NULL) {
1314                         continue;
1315                 }
1316
1317                 public_ips = &ipalloc_state->known_public_ips[i];
1318
1319                 for (j=0; j < public_ips->num; j++) {
1320                         struct public_ip_list *tmp_ip;
1321
1322                         tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1323                         CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1324                         /* Do not use information about IP addresses hosted
1325                          * on other nodes, it may not be accurate */
1326                         if (public_ips->ip[j].pnn == ctdb->nodes[i]->pnn) {
1327                                 tmp_ip->pnn = public_ips->ip[j].pnn;
1328                         } else {
1329                                 tmp_ip->pnn = -1;
1330                         }
1331                         tmp_ip->addr = public_ips->ip[j].addr;
1332                         tmp_ip->next = NULL;
1333
1334                         trbt_insertarray32_callback(ctdb->ip_tree,
1335                                 IP_KEYLEN, ip_key(&public_ips->ip[j].addr),
1336                                 add_ip_callback,
1337                                 tmp_ip);
1338                 }
1339         }
1340
1341         ip_list = NULL;
1342         trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1343
1344         return ip_list;
1345 }
1346
1347 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
1348 {
1349         int i;
1350
1351         for (i=0;i<nodemap->num;i++) {
1352                 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1353                         /* Found one completely healthy node */
1354                         return false;
1355                 }
1356         }
1357
1358         return true;
1359 }
1360
/*
 * State shared between get_tunable_from_nodes() and its reply/failure
 * callbacks.
 */
struct get_tunable_callback_data {
        /* Name of the tunable being queried; used in log messages */
        const char *tunable;
        /* talloc array, indexed by node number, receiving each node's value */
        uint32_t *out;
        /* Set by the callbacks when an error must invalidate the whole query */
        bool fatal;
};
1366
1367 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
1368                                  int32_t res, TDB_DATA outdata,
1369                                  void *callback)
1370 {
1371         struct get_tunable_callback_data *cd =
1372                 (struct get_tunable_callback_data *)callback;
1373         int size;
1374
1375         if (res != 0) {
1376                 /* Already handled in fail callback */
1377                 return;
1378         }
1379
1380         if (outdata.dsize != sizeof(uint32_t)) {
1381                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
1382                                  cd->tunable, pnn, (int)sizeof(uint32_t),
1383                                  (int)outdata.dsize));
1384                 cd->fatal = true;
1385                 return;
1386         }
1387
1388         size = talloc_array_length(cd->out);
1389         if (pnn >= size) {
1390                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
1391                                  cd->tunable, pnn, size));
1392                 return;
1393         }
1394
1395                 
1396         cd->out[pnn] = *(uint32_t *)outdata.dptr;
1397 }
1398
1399 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
1400                                        int32_t res, TDB_DATA outdata,
1401                                        void *callback)
1402 {
1403         struct get_tunable_callback_data *cd =
1404                 (struct get_tunable_callback_data *)callback;
1405
1406         switch (res) {
1407         case -ETIME:
1408                 DEBUG(DEBUG_ERR,
1409                       ("Timed out getting tunable \"%s\" from node %d\n",
1410                        cd->tunable, pnn));
1411                 cd->fatal = true;
1412                 break;
1413         case -EINVAL:
1414         case -1:
1415                 DEBUG(DEBUG_WARNING,
1416                       ("Tunable \"%s\" not implemented on node %d\n",
1417                        cd->tunable, pnn));
1418                 break;
1419         default:
1420                 DEBUG(DEBUG_ERR,
1421                       ("Unexpected error getting tunable \"%s\" from node %d\n",
1422                        cd->tunable, pnn));
1423                 cd->fatal = true;
1424         }
1425 }
1426
1427 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
1428                                         TALLOC_CTX *tmp_ctx,
1429                                         struct ctdb_node_map_old *nodemap,
1430                                         const char *tunable,
1431                                         uint32_t default_value)
1432 {
1433         TDB_DATA data;
1434         struct ctdb_control_get_tunable *t;
1435         uint32_t *nodes;
1436         uint32_t *tvals;
1437         struct get_tunable_callback_data callback_data;
1438         int i;
1439
1440         tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1441         CTDB_NO_MEMORY_NULL(ctdb, tvals);
1442         for (i=0; i<nodemap->num; i++) {
1443                 tvals[i] = default_value;
1444         }
1445                 
1446         callback_data.out = tvals;
1447         callback_data.tunable = tunable;
1448         callback_data.fatal = false;
1449
1450         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
1451         data.dptr  = talloc_size(tmp_ctx, data.dsize);
1452         t = (struct ctdb_control_get_tunable *)data.dptr;
1453         t->length = strlen(tunable)+1;
1454         memcpy(t->name, tunable, t->length);
1455         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1456         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
1457                                       nodes, 0, TAKEOVER_TIMEOUT(),
1458                                       false, data,
1459                                       get_tunable_callback,
1460                                       get_tunable_fail_callback,
1461                                       &callback_data) != 0) {
1462                 if (callback_data.fatal) {
1463                         talloc_free(tvals);
1464                         tvals = NULL;
1465                 }
1466         }
1467         talloc_free(nodes);
1468         talloc_free(data.dptr);
1469
1470         return tvals;
1471 }
1472
1473 /* Set internal flags for IP allocation:
1474  *   Clear ip flags
1475  *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
1476  *   Set NOIPHOST ip flag for each INACTIVE node
1477  *   if all nodes are disabled:
1478  *     Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
1479  *   else
1480  *     Set NOIPHOST ip flags for disabled nodes
1481  */
1482 static void set_ipflags_internal(struct ipalloc_state *ipalloc_state,
1483                                  struct ctdb_node_map_old *nodemap,
1484                                  uint32_t *tval_noiptakeover,
1485                                  uint32_t *tval_noiphostonalldisabled)
1486 {
1487         int i;
1488
1489         for (i=0;i<nodemap->num;i++) {
1490                 /* Can not take IPs on node with NoIPTakeover set */
1491                 if (tval_noiptakeover[i] != 0) {
1492                         ipalloc_state->noiptakeover[i] = true;
1493                 }
1494
1495                 /* Can not host IPs on INACTIVE node */
1496                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1497                         ipalloc_state->noiphost[i] = true;
1498                 }
1499         }
1500
1501         if (all_nodes_are_disabled(nodemap)) {
1502                 /* If all nodes are disabled, can not host IPs on node
1503                  * with NoIPHostOnAllDisabled set
1504                  */
1505                 for (i=0;i<nodemap->num;i++) {
1506                         if (tval_noiphostonalldisabled[i] != 0) {
1507                                 ipalloc_state->noiphost[i] = true;
1508                         }
1509                 }
1510         } else {
1511                 /* If some nodes are not disabled, then can not host
1512                  * IPs on DISABLED node
1513                  */
1514                 for (i=0;i<nodemap->num;i++) {
1515                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
1516                                 ipalloc_state->noiphost[i] = true;
1517                         }
1518                 }
1519         }
1520 }
1521
/*
 * Query the NoIPTakeover and NoIPHostOnAllDisabled tunables from the
 * nodes (default 0 for both) and translate them, together with the
 * node flags, into ipalloc_state's per-node flags.
 *
 * Returns true on success, false if either tunable query fails.  The
 * temporary value arrays are allocated off ipalloc_state; on the
 * failure path an array that was already fetched is left for whoever
 * frees that context.
 */
static bool set_ipflags(struct ctdb_context *ctdb,
                        struct ipalloc_state *ipalloc_state,
                        struct ctdb_node_map_old *nodemap)
{
        uint32_t *tval_noiptakeover;
        uint32_t *tval_noiphostonalldisabled;
        bool ok = false;

        tval_noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
                                                   "NoIPTakeover", 0);
        if (tval_noiptakeover != NULL) {
                tval_noiphostonalldisabled =
                        get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
                                               "NoIPHostOnAllDisabled", 0);
                if (tval_noiphostonalldisabled != NULL) {
                        set_ipflags_internal(ipalloc_state, nodemap,
                                             tval_noiptakeover,
                                             tval_noiphostonalldisabled);

                        talloc_free(tval_noiptakeover);
                        talloc_free(tval_noiphostonalldisabled);
                        ok = true;
                }
                /* else: caller frees the context holding the first array */
        }

        return ok;
}
1552
1553 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
1554                                                  TALLOC_CTX *mem_ctx)
1555 {
1556         struct ipalloc_state *ipalloc_state =
1557                 talloc_zero(mem_ctx, struct ipalloc_state);
1558         if (ipalloc_state == NULL) {
1559                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1560                 return NULL;
1561         }
1562
1563         ipalloc_state->num = ctdb->num_nodes;
1564
1565         ipalloc_state->known_public_ips =
1566                 talloc_zero_array(ipalloc_state,
1567                                   struct ctdb_public_ip_list,
1568                                   ipalloc_state->num);
1569         if (ipalloc_state->known_public_ips == NULL) {
1570                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1571                 goto fail;
1572         }
1573
1574         ipalloc_state->available_public_ips =
1575                 talloc_zero_array(ipalloc_state,
1576                                   struct ctdb_public_ip_list,
1577                                   ipalloc_state->num);
1578         if (ipalloc_state->available_public_ips == NULL) {
1579                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1580                 goto fail;
1581         }
1582         ipalloc_state->noiptakeover =
1583                 talloc_zero_array(ipalloc_state,
1584                                   bool,
1585                                   ipalloc_state->num);
1586         if (ipalloc_state->noiptakeover == NULL) {
1587                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1588                 goto fail;
1589         }
1590         ipalloc_state->noiphost =
1591                 talloc_zero_array(ipalloc_state,
1592                                   bool,
1593                                   ipalloc_state->num);
1594         if (ipalloc_state->noiphost == NULL) {
1595                 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1596                 goto fail;
1597         }
1598
1599         if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1600                 ipalloc_state->algorithm = IPALLOC_LCP2;
1601         } else if (1 == ctdb->tunable.deterministic_public_ips) {
1602                 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
1603         } else {
1604                 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
1605         }
1606
1607         ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
1608
1609         return ipalloc_state;
1610 fail:
1611         talloc_free(ipalloc_state);
1612         return NULL;
1613 }
1614
/*
 * State handed to iprealloc_fail_callback() while the IPREALLOCATED
 * control is outstanding.
 */
struct iprealloc_callback_data {
        /* Per-node flags, indexed by PNN; set when a retry is wanted */
        bool *retry_nodes;
        /* Number of failures that were flagged for retry */
        int retry_count;
        /* Caller's failure callback, invoked when a control times out */
        client_async_callback fail_callback;
        /* Opaque argument passed through to fail_callback */
        void *fail_callback_data;
        /* Node map consulted for the flags of the failing node */
        struct ctdb_node_map_old *nodemap;
};
1622
1623 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
1624                                         int32_t res, TDB_DATA outdata,
1625                                         void *callback)
1626 {
1627         int numnodes;
1628         struct iprealloc_callback_data *cd =
1629                 (struct iprealloc_callback_data *)callback;
1630
1631         numnodes = talloc_array_length(cd->retry_nodes);
1632         if (pnn > numnodes) {
1633                 DEBUG(DEBUG_ERR,
1634                       ("ipreallocated failure from node %d, "
1635                        "but only %d nodes in nodemap\n",
1636                        pnn, numnodes));
1637                 return;
1638         }
1639
1640         /* Can't run the "ipreallocated" event on a INACTIVE node */
1641         if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
1642                 DEBUG(DEBUG_WARNING,
1643                       ("ipreallocated failed on inactive node %d, ignoring\n",
1644                        pnn));
1645                 return;
1646         }
1647
1648         switch (res) {
1649         case -ETIME:
1650                 /* If the control timed out then that's a real error,
1651                  * so call the real fail callback
1652                  */
1653                 if (cd->fail_callback) {
1654                         cd->fail_callback(ctdb, pnn, res, outdata,
1655                                           cd->fail_callback_data);
1656                 } else {
1657                         DEBUG(DEBUG_WARNING,
1658                               ("iprealloc timed out but no callback registered\n"));
1659                 }
1660                 break;
1661         default:
1662                 /* If not a timeout then either the ipreallocated
1663                  * eventscript (or some setup) failed.  This might
1664                  * have failed because the IPREALLOCATED control isn't
1665                  * implemented - right now there is no way of knowing
1666                  * because the error codes are all folded down to -1.
1667                  * Consider retrying using EVENTSCRIPT control...
1668                  */
1669                 DEBUG(DEBUG_WARNING,
1670                       ("ipreallocated failure from node %d, flagging retry\n",
1671                        pnn));
1672                 cd->retry_nodes[pnn] = true;
1673                 cd->retry_count++;
1674         }
1675 }
1676
/*
 * State handed to takeover_run_fail_callback() while RELEASE_IP
 * controls are outstanding.
 */
struct takeover_callback_data {
        /* Per-node flags, indexed by position in nodemap->nodes[];
         * set once a failure has been reported for that node */
        bool *node_failed;
        /* Caller's failure callback, invoked once per failing node */
        client_async_callback fail_callback;
        /* Opaque argument passed through to fail_callback */
        void *fail_callback_data;
        /* Node map used to translate a PNN into an array position */
        struct ctdb_node_map_old *nodemap;
};
1683
1684 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
1685                                        uint32_t node_pnn, int32_t res,
1686                                        TDB_DATA outdata, void *callback_data)
1687 {
1688         struct takeover_callback_data *cd =
1689                 talloc_get_type_abort(callback_data,
1690                                       struct takeover_callback_data);
1691         int i;
1692
1693         for (i = 0; i < cd->nodemap->num; i++) {
1694                 if (node_pnn == cd->nodemap->nodes[i].pnn) {
1695                         break;
1696                 }
1697         }
1698
1699         if (i == cd->nodemap->num) {
1700                 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
1701                 return;
1702         }
1703
1704         if (!cd->node_failed[i]) {
1705                 cd->node_failed[i] = true;
1706                 cd->fail_callback(ctdb, node_pnn, res, outdata,
1707                                   cd->fail_callback_data);
1708         }
1709 }
1710
1711 /*
1712  * Recalculate the allocation of public IPs to nodes and have the
1713  * nodes host their allocated addresses.
1714  *
1715  * - Allocate memory for IP allocation state, including per node
1716  *   arrays
1717  * - Populate IP allocation algorithm in IP allocation state
1718  * - Populate local value of tunable NoIPFailback in IP allocation
1719      state - this is really a cluster-wide configuration variable and
1720      only the value form the master node is used
1721  * - Retrieve tunables NoIPTakeover and NoIPHostOnAllDisabled from all
1722  *   connected nodes - this is done separately so tunable values can
1723  *   be faked in unit testing
1724  * - Populate NoIPTakover tunable in IP allocation state
1725  * - Populate NoIPHost in IP allocation state, derived from node flags
1726  *   and NoIPHostOnAllDisabled tunable
1727  * - Retrieve and populate known and available IP lists in IP
1728  *   allocation state
1729  * - If no available IP addresses then early exit
1730  * - Build list of (known IPs, currently assigned node)
1731  * - Populate list of nodes to force rebalance - internal structure,
1732  *   currently no way to fetch, only used by LCP2 for nodes that have
1733  *   had new IP addresses added
1734  * - Run IP allocation algorithm
1735  * - Send RELEASE_IP to all nodes for IPs they should not host
1736  * - Send TAKE_IP to all nodes for IPs they should host
1737  * - Send IPREALLOCATED to all nodes (with backward compatibility hack)
1738  */
1739 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
1740                       uint32_t *force_rebalance_nodes,
1741                       client_async_callback fail_callback, void *callback_data)
1742 {
1743         int i, j, ret;
1744         struct ctdb_public_ip ip;
1745         uint32_t *nodes;
1746         struct public_ip_list *all_ips, *tmp_ip;
1747         TDB_DATA data;
1748         struct timeval timeout;
1749         struct client_async_data *async_data;
1750         struct ctdb_client_control_state *state;
1751         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1752         struct ipalloc_state *ipalloc_state;
1753         struct takeover_callback_data *takeover_data;
1754         struct iprealloc_callback_data iprealloc_data;
1755         bool *retry_data;
1756         bool can_host_ips;
1757
1758         /*
1759          * ip failover is completely disabled, just send out the 
1760          * ipreallocated event.
1761          */
1762         if (ctdb->tunable.disable_ip_failover != 0) {
1763                 goto ipreallocated;
1764         }
1765
1766         ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
1767         if (ipalloc_state == NULL) {
1768                 talloc_free(tmp_ctx);
1769                 return -1;
1770         }
1771
1772         if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
1773                 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
1774                 talloc_free(tmp_ctx);
1775                 return -1;
1776         }
1777
1778         /* Fetch known/available public IPs from each active node */
1779         ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
1780         if (ret != 0) {
1781                 talloc_free(tmp_ctx);
1782                 return -1;
1783         }
1784
1785         /* Short-circuit IP allocation if no node has available IPs */
1786         can_host_ips = false;
1787         for (i=0; i < ipalloc_state->num; i++) {
1788                 if (ipalloc_state->available_public_ips[i].num != 0) {
1789                         can_host_ips = true;
1790                 }
1791         }
1792         if (!can_host_ips) {
1793                 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
1794                 return 0;
1795         }
1796
1797         /* since nodes only know about those public addresses that
1798            can be served by that particular node, no single node has
1799            a full list of all public addresses that exist in the cluster.
1800            Walk over all node structures and create a merged list of
1801            all public addresses that exist in the cluster.
1802
1803            keep the tree of ips around as ctdb->ip_tree
1804         */
1805         all_ips = create_merged_ip_list(ctdb, ipalloc_state);
1806         ipalloc_state->all_ips = all_ips;
1807
1808         ipalloc_state->force_rebalance_nodes = force_rebalance_nodes;
1809
1810         /* Do the IP reassignment calculations */
1811         ipalloc(ipalloc_state);
1812
1813         /* Now tell all nodes to release any public IPs should not
1814          * host.  This will be a NOOP on nodes that don't currently
1815          * hold the given IP.
1816          */
1817         takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
1818         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
1819
1820         takeover_data->node_failed = talloc_zero_array(tmp_ctx,
1821                                                        bool, nodemap->num);
1822         CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
1823         takeover_data->fail_callback = fail_callback;
1824         takeover_data->fail_callback_data = callback_data;
1825         takeover_data->nodemap = nodemap;
1826
1827         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1828         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1829
1830         async_data->fail_callback = takeover_run_fail_callback;
1831         async_data->callback_data = takeover_data;
1832
1833         ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
1834
1835         /* Send a RELEASE_IP to all nodes that should not be hosting
1836          * each IP.  For each IP, all but one of these will be
1837          * redundant.  However, the redundant ones are used to tell
1838          * nodes which node should be hosting the IP so that commands
1839          * like "ctdb ip" can display a particular nodes idea of who
1840          * is hosting what. */
1841         for (i=0;i<nodemap->num;i++) {
1842                 /* don't talk to unconnected nodes, but do talk to banned nodes */
1843                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1844                         continue;
1845                 }
1846
1847                 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1848                         if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1849                                 /* This node should be serving this
1850                                    vnn so don't tell it to release the ip
1851                                 */
1852                                 continue;
1853                         }
1854                         ip.pnn  = tmp_ip->pnn;
1855                         ip.addr = tmp_ip->addr;
1856
1857                         timeout = TAKEOVER_TIMEOUT();
1858                         data.dsize = sizeof(ip);
1859                         data.dptr  = (uint8_t *)&ip;
1860                         state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1861                                                   0, CTDB_CONTROL_RELEASE_IP, 0,
1862                                                   data, async_data,
1863                                                   &timeout, NULL);
1864                         if (state == NULL) {
1865                                 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1866                                 talloc_free(tmp_ctx);
1867                                 return -1;
1868                         }
1869
1870                         ctdb_client_async_add(async_data, state);
1871                 }
1872         }
1873         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1874                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1875                 talloc_free(tmp_ctx);
1876                 return -1;
1877         }
1878         talloc_free(async_data);
1879
1880
1881         /* For each IP, send a TAKOVER_IP to the node that should be
1882          * hosting it.  Many of these will often be redundant (since
1883          * the allocation won't have changed) but they can be useful
1884          * to recover from inconsistencies. */
1885         async_data = talloc_zero(tmp_ctx, struct client_async_data);
1886         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1887
1888         async_data->fail_callback = fail_callback;
1889         async_data->callback_data = callback_data;
1890
1891         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1892                 if (tmp_ip->pnn == -1) {
1893                         /* this IP won't be taken over */
1894                         continue;
1895                 }
1896
1897                 ip.pnn  = tmp_ip->pnn;
1898                 ip.addr = tmp_ip->addr;
1899
1900                 timeout = TAKEOVER_TIMEOUT();
1901                 data.dsize = sizeof(ip);
1902                 data.dptr  = (uint8_t *)&ip;
1903                 state = ctdb_control_send(ctdb, tmp_ip->pnn,
1904                                           0, CTDB_CONTROL_TAKEOVER_IP, 0,
1905                                           data, async_data, &timeout, NULL);
1906                 if (state == NULL) {
1907                         DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1908                         talloc_free(tmp_ctx);
1909                         return -1;
1910                 }
1911
1912                 ctdb_client_async_add(async_data, state);
1913         }
1914         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1915                 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1916                 talloc_free(tmp_ctx);
1917                 return -1;
1918         }
1919
1920 ipreallocated:
1921         /*
1922          * Tell all nodes to run eventscripts to process the
1923          * "ipreallocated" event.  This can do a lot of things,
1924          * including restarting services to reconfigure them if public
1925          * IPs have moved.  Once upon a time this event only used to
1926          * update natgw.
1927          */
1928         retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
1929         CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
1930         iprealloc_data.retry_nodes = retry_data;
1931         iprealloc_data.retry_count = 0;
1932         iprealloc_data.fail_callback = fail_callback;
1933         iprealloc_data.fail_callback_data = callback_data;
1934         iprealloc_data.nodemap = nodemap;
1935
1936         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1937         ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
1938                                         nodes, 0, TAKEOVER_TIMEOUT(),
1939                                         false, tdb_null,
1940                                         NULL, iprealloc_fail_callback,
1941                                         &iprealloc_data);
1942         if (ret != 0) {
1943                 /* If the control failed then we should retry to any
1944                  * nodes flagged by iprealloc_fail_callback using the
1945                  * EVENTSCRIPT control.  This is a best-effort at
1946                  * backward compatiblity when running a mixed cluster
1947                  * where some nodes have not yet been upgraded to
1948                  * support the IPREALLOCATED control.
1949                  */
1950                 DEBUG(DEBUG_WARNING,
1951                       ("Retry ipreallocated to some nodes using eventscript control\n"));
1952
1953                 nodes = talloc_array(tmp_ctx, uint32_t,
1954                                      iprealloc_data.retry_count);
1955                 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
1956
1957                 j = 0;
1958                 for (i=0; i<nodemap->num; i++) {
1959                         if (iprealloc_data.retry_nodes[i]) {
1960                                 nodes[j] = i;
1961                                 j++;
1962                         }
1963                 }
1964
1965                 data.dptr  = discard_const("ipreallocated");
1966                 data.dsize = strlen((char *)data.dptr) + 1; 
1967                 ret = ctdb_client_async_control(ctdb,
1968                                                 CTDB_CONTROL_RUN_EVENTSCRIPTS,
1969                                                 nodes, 0, TAKEOVER_TIMEOUT(),
1970                                                 false, data,
1971                                                 NULL, fail_callback,
1972                                                 callback_data);
1973                 if (ret != 0) {
1974                         DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
1975                 }
1976         }
1977
1978         talloc_free(tmp_ctx);
1979         return ret;
1980 }
1981
1982
1983 /*
1984   destroy a ctdb_client_ip structure
1985  */
1986 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1987 {
1988         DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1989                 ctdb_addr_to_str(&ip->addr),
1990                 ntohs(ip->addr.ip.sin_port),
1991                 ip->client_id));
1992
1993         DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1994         return 0;
1995 }
1996
1997 /*
1998   called by a client to inform us of a TCP connection that it is managing
1999   that should tickled with an ACK when IP takeover is done
2000  */
2001 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2002                                 TDB_DATA indata)
2003 {
2004         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2005         struct ctdb_connection *tcp_sock = NULL;
2006         struct ctdb_tcp_list *tcp;
2007         struct ctdb_connection t;
2008         int ret;
2009         TDB_DATA data;
2010         struct ctdb_client_ip *ip;
2011         struct ctdb_vnn *vnn;
2012         ctdb_sock_addr addr;
2013
2014         /* If we don't have public IPs, tickles are useless */
2015         if (ctdb->vnn == NULL) {
2016                 return 0;
2017         }
2018
2019         tcp_sock = (struct ctdb_connection *)indata.dptr;
2020
2021         addr = tcp_sock->src;
2022         ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
2023         addr = tcp_sock->dst;
2024         ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2025
2026         ZERO_STRUCT(addr);
2027         memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2028         vnn = find_public_ip_vnn(ctdb, &addr);
2029         if (vnn == NULL) {
2030                 switch (addr.sa.sa_family) {
2031                 case AF_INET:
2032                         if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2033                                 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
2034                                         ctdb_addr_to_str(&addr)));
2035                         }
2036                         break;
2037                 case AF_INET6:
2038                         DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
2039                                 ctdb_addr_to_str(&addr)));
2040                         break;
2041                 default:
2042                         DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2043                 }
2044
2045                 return 0;
2046         }
2047
2048         if (vnn->pnn != ctdb->pnn) {
2049                 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2050                         ctdb_addr_to_str(&addr),
2051                         client_id, client->pid));
2052                 /* failing this call will tell smbd to die */
2053                 return -1;
2054         }
2055
2056         ip = talloc(client, struct ctdb_client_ip);
2057         CTDB_NO_MEMORY(ctdb, ip);
2058
2059         ip->ctdb      = ctdb;
2060         ip->addr      = addr;
2061         ip->client_id = client_id;
2062         talloc_set_destructor(ip, ctdb_client_ip_destructor);
2063         DLIST_ADD(ctdb->client_ip_list, ip);
2064
2065         tcp = talloc(client, struct ctdb_tcp_list);
2066         CTDB_NO_MEMORY(ctdb, tcp);
2067
2068         tcp->connection.src = tcp_sock->src;
2069         tcp->connection.dst = tcp_sock->dst;
2070
2071         DLIST_ADD(client->tcp_list, tcp);
2072
2073         t.src = tcp_sock->src;
2074         t.dst = tcp_sock->dst;
2075
2076         data.dptr = (uint8_t *)&t;
2077         data.dsize = sizeof(t);
2078
2079         switch (addr.sa.sa_family) {
2080         case AF_INET:
2081                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2082                         (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2083                         ctdb_addr_to_str(&tcp_sock->src),
2084                         (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2085                 break;
2086         case AF_INET6:
2087                 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2088                         (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2089                         ctdb_addr_to_str(&tcp_sock->src),
2090                         (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2091                 break;
2092         default:
2093                 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2094         }
2095
2096
2097         /* tell all nodes about this tcp connection */
2098         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
2099                                        CTDB_CONTROL_TCP_ADD,
2100                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2101         if (ret != 0) {
2102                 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2103                 return -1;
2104         }
2105
2106         return 0;
2107 }
2108
2109 /*
2110   find a tcp address on a list
2111  */
2112 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2113                                            struct ctdb_connection *tcp)
2114 {
2115         int i;
2116
2117         if (array == NULL) {
2118                 return NULL;
2119         }
2120
2121         for (i=0;i<array->num;i++) {
2122                 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2123                     ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2124                         return &array->connections[i];
2125                 }
2126         }
2127         return NULL;
2128 }
2129
2130
2131
2132 /*
2133   called by a daemon to inform us of a TCP connection that one of its
2134   clients managing that should tickled with an ACK when IP takeover is
2135   done
2136  */
2137 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2138 {
2139         struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2140         struct ctdb_tcp_array *tcparray;
2141         struct ctdb_connection tcp;
2142         struct ctdb_vnn *vnn;
2143
2144         /* If we don't have public IPs, tickles are useless */
2145         if (ctdb->vnn == NULL) {
2146                 return 0;
2147         }
2148
2149         vnn = find_public_ip_vnn(ctdb, &p->dst);
2150         if (vnn == NULL) {
2151                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2152                         ctdb_addr_to_str(&p->dst)));
2153
2154                 return -1;
2155         }
2156
2157
2158         tcparray = vnn->tcp_array;
2159
2160         /* If this is the first tickle */
2161         if (tcparray == NULL) {
2162                 tcparray = talloc(vnn, struct ctdb_tcp_array);
2163                 CTDB_NO_MEMORY(ctdb, tcparray);
2164                 vnn->tcp_array = tcparray;
2165
2166                 tcparray->num = 0;
2167                 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2168                 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2169
2170                 tcparray->connections[tcparray->num].src = p->src;
2171                 tcparray->connections[tcparray->num].dst = p->dst;
2172                 tcparray->num++;
2173
2174                 if (tcp_update_needed) {
2175                         vnn->tcp_update_needed = true;
2176                 }
2177                 return 0;
2178         }
2179
2180
2181         /* Do we already have this tickle ?*/
2182         tcp.src = p->src;
2183         tcp.dst = p->dst;
2184         if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2185                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2186                         ctdb_addr_to_str(&tcp.dst),
2187                         ntohs(tcp.dst.ip.sin_port),
2188                         vnn->pnn));
2189                 return 0;
2190         }
2191
2192         /* A new tickle, we must add it to the array */
2193         tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2194                                         struct ctdb_connection,
2195                                         tcparray->num+1);
2196         CTDB_NO_MEMORY(ctdb, tcparray->connections);
2197
2198         tcparray->connections[tcparray->num].src = p->src;
2199         tcparray->connections[tcparray->num].dst = p->dst;
2200         tcparray->num++;
2201
2202         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2203                 ctdb_addr_to_str(&tcp.dst),
2204                 ntohs(tcp.dst.ip.sin_port),
2205                 vnn->pnn));
2206
2207         if (tcp_update_needed) {
2208                 vnn->tcp_update_needed = true;
2209         }
2210
2211         return 0;
2212 }
2213
2214
2215 static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
2216 {
2217         struct ctdb_connection *tcpp;
2218
2219         if (vnn == NULL) {
2220                 return;
2221         }
2222
2223         /* if the array is empty we cant remove it
2224            and we don't need to do anything
2225          */
2226         if (vnn->tcp_array == NULL) {
2227                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2228                         ctdb_addr_to_str(&conn->dst),
2229                         ntohs(conn->dst.ip.sin_port)));
2230                 return;
2231         }
2232
2233
2234         /* See if we know this connection
2235            if we don't know this connection  then we dont need to do anything
2236          */
2237         tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2238         if (tcpp == NULL) {
2239                 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2240                         ctdb_addr_to_str(&conn->dst),
2241                         ntohs(conn->dst.ip.sin_port)));
2242                 return;
2243         }
2244
2245
2246         /* We need to remove this entry from the array.
2247            Instead of allocating a new array and copying data to it
2248            we cheat and just copy the last entry in the existing array
2249            to the entry that is to be removed and just shring the 
2250            ->num field
2251          */
2252         *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2253         vnn->tcp_array->num--;
2254
2255         /* If we deleted the last entry we also need to remove the entire array
2256          */
2257         if (vnn->tcp_array->num == 0) {
2258                 talloc_free(vnn->tcp_array);
2259                 vnn->tcp_array = NULL;
2260         }               
2261
2262         vnn->tcp_update_needed = true;
2263
2264         DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2265                 ctdb_addr_to_str(&conn->src),
2266                 ntohs(conn->src.ip.sin_port)));
2267 }
2268
2269
2270 /*
2271   called by a daemon to inform us of a TCP connection that one of its
2272   clients used are no longer needed in the tickle database
2273  */
2274 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2275 {
2276         struct ctdb_vnn *vnn;
2277         struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
2278
2279         /* If we don't have public IPs, tickles are useless */
2280         if (ctdb->vnn == NULL) {
2281                 return 0;
2282         }
2283
2284         vnn = find_public_ip_vnn(ctdb, &conn->dst);
2285         if (vnn == NULL) {
2286                 DEBUG(DEBUG_ERR,
2287                       (__location__ " unable to find public address %s\n",
2288                        ctdb_addr_to_str(&conn->dst)));
2289                 return 0;
2290         }
2291
2292         ctdb_remove_connection(vnn, conn);
2293
2294         return 0;
2295 }
2296
2297
2298 /*
2299   Called when another daemon starts - causes all tickles for all
2300   public addresses we are serving to be sent to the new node on the
2301   next check.  This actually causes the next scheduled call to
2302   tdb_update_tcp_tickles() to update all nodes.  This is simple and
2303   doesn't require careful error handling.
2304  */
2305 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
2306 {
2307         struct ctdb_vnn *vnn;
2308
2309         DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
2310                            (unsigned long) pnn));
2311
2312         for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
2313                 vnn->tcp_update_needed = true;
2314         }
2315
2316         return 0;
2317 }
2318
2319
2320 /*
2321   called when a client structure goes away - hook to remove
2322   elements from the tcp_list in all daemons
2323  */
2324 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2325 {
2326         while (client->tcp_list) {
2327                 struct ctdb_vnn *vnn;
2328                 struct ctdb_tcp_list *tcp = client->tcp_list;
2329                 struct ctdb_connection *conn = &tcp->connection;
2330
2331                 DLIST_REMOVE(client->tcp_list, tcp);
2332
2333                 vnn = find_public_ip_vnn(client->ctdb,
2334                                          &conn->dst);
2335                 if (vnn == NULL) {
2336                         DEBUG(DEBUG_ERR,
2337                               (__location__ " unable to find public address %s\n",
2338                                ctdb_addr_to_str(&conn->dst)));
2339                         continue;
2340                 }
2341
2342                 /* If the IP address is hosted on this node then
2343                  * remove the connection. */
2344                 if (vnn->pnn == client->ctdb->pnn) {
2345                         ctdb_remove_connection(vnn, conn);
2346                 }
2347
2348                 /* Otherwise this function has been called because the
2349                  * server IP address has been released to another node
2350                  * and the client has exited.  This means that we
2351                  * should not delete the connection information.  The
2352                  * takeover node processes connections too. */
2353         }
2354 }
2355
2356
2357 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2358 {
2359         struct ctdb_vnn *vnn;
2360         int count = 0;
2361         TDB_DATA data;
2362
2363         if (ctdb->tunable.disable_ip_failover == 1) {
2364                 return;
2365         }
2366
2367         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2368                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2369                         ctdb_vnn_unassign_iface(ctdb, vnn);
2370                         continue;
2371                 }
2372                 if (!vnn->iface) {
2373                         continue;
2374                 }
2375
2376                 /* Don't allow multiple releases at once.  Some code,
2377                  * particularly ctdb_tickle_sentenced_connections() is
2378                  * not re-entrant */
2379                 if (vnn->update_in_flight) {
2380                         DEBUG(DEBUG_WARNING,
2381                               (__location__
2382                                " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
2383                                     ctdb_addr_to_str(&vnn->public_address),
2384                                     vnn->public_netmask_bits,
2385                                     ctdb_vnn_iface_string(vnn)));
2386                         continue;
2387                 }
2388                 vnn->update_in_flight = true;
2389
2390                 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
2391                                     ctdb_addr_to_str(&vnn->public_address),
2392                                     vnn->public_netmask_bits,
2393                                     ctdb_vnn_iface_string(vnn)));
2394
2395                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2396                                   ctdb_vnn_iface_string(vnn),
2397                                   ctdb_addr_to_str(&vnn->public_address),
2398                                   vnn->public_netmask_bits);
2399
2400                 data.dptr = (uint8_t *)talloc_strdup(
2401                                 vnn, ctdb_addr_to_str(&vnn->public_address));
2402                 if (data.dptr != NULL) {
2403                         data.dsize = strlen((char *)data.dptr) + 1;
2404                         ctdb_daemon_send_message(ctdb, ctdb->pnn,
2405                                                  CTDB_SRVID_RELEASE_IP, data);
2406                         talloc_free(data.dptr);
2407                 }
2408
2409                 ctdb_vnn_unassign_iface(ctdb, vnn);
2410                 vnn->update_in_flight = false;
2411                 count++;
2412         }
2413
2414         DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
2415 }
2416
2417
2418 /*
2419   get list of public IPs
2420  */
2421 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
2422                                     struct ctdb_req_control_old *c, TDB_DATA *outdata)
2423 {
2424         int i, num, len;
2425         struct ctdb_public_ip_list_old *ips;
2426         struct ctdb_vnn *vnn;
2427         bool only_available = false;
2428
2429         if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2430                 only_available = true;
2431         }
2432
2433         /* count how many public ip structures we have */
2434         num = 0;
2435         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2436                 num++;
2437         }
2438
2439         len = offsetof(struct ctdb_public_ip_list_old, ips) +
2440                 num*sizeof(struct ctdb_public_ip);
2441         ips = talloc_zero_size(outdata, len);
2442         CTDB_NO_MEMORY(ctdb, ips);
2443
2444         i = 0;
2445         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2446                 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2447                         continue;
2448                 }
2449                 ips->ips[i].pnn  = vnn->pnn;
2450                 ips->ips[i].addr = vnn->public_address;
2451                 i++;
2452         }
2453         ips->num = i;
2454         len = offsetof(struct ctdb_public_ip_list_old, ips) +
2455                 i*sizeof(struct ctdb_public_ip);
2456
2457         outdata->dsize = len;
2458         outdata->dptr  = (uint8_t *)ips;
2459
2460         return 0;
2461 }
2462
2463
2464 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2465                                         struct ctdb_req_control_old *c,
2466                                         TDB_DATA indata,
2467                                         TDB_DATA *outdata)
2468 {
2469         int i, num, len;
2470         ctdb_sock_addr *addr;
2471         struct ctdb_public_ip_info_old *info;
2472         struct ctdb_vnn *vnn;
2473
2474         addr = (ctdb_sock_addr *)indata.dptr;
2475
2476         vnn = find_public_ip_vnn(ctdb, addr);
2477         if (vnn == NULL) {
2478                 /* if it is not a public ip   it could be our 'single ip' */
2479                 if (ctdb->single_ip_vnn) {
2480                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2481                                 vnn = ctdb->single_ip_vnn;
2482                         }
2483                 }
2484         }
2485         if (vnn == NULL) {
2486                 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2487                                  "'%s'not a public address\n",
2488                                  ctdb_addr_to_str(addr)));
2489                 return -1;
2490         }
2491
2492         /* count how many public ip structures we have */
2493         num = 0;
2494         for (;vnn->ifaces[num];) {
2495                 num++;
2496         }
2497
2498         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2499                 num*sizeof(struct ctdb_iface);
2500         info = talloc_zero_size(outdata, len);
2501         CTDB_NO_MEMORY(ctdb, info);
2502
2503         info->ip.addr = vnn->public_address;
2504         info->ip.pnn = vnn->pnn;
2505         info->active_idx = 0xFFFFFFFF;
2506
2507         for (i=0; vnn->ifaces[i]; i++) {
2508                 struct ctdb_interface *cur;
2509
2510                 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2511                 if (cur == NULL) {
2512                         DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2513                                            vnn->ifaces[i]));
2514                         return -1;
2515                 }
2516                 if (vnn->iface == cur) {
2517                         info->active_idx = i;
2518                 }
2519                 strncpy(info->ifaces[i].name, cur->name,
2520                         sizeof(info->ifaces[i].name));
2521                 info->ifaces[i].name[sizeof(info->ifaces[i].name)-1] = '\0';
2522                 info->ifaces[i].link_state = cur->link_up;
2523                 info->ifaces[i].references = cur->references;
2524         }
2525         info->num = i;
2526         len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2527                 i*sizeof(struct ctdb_iface);
2528
2529         outdata->dsize = len;
2530         outdata->dptr  = (uint8_t *)info;
2531
2532         return 0;
2533 }
2534
2535 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2536                                 struct ctdb_req_control_old *c,
2537                                 TDB_DATA *outdata)
2538 {
2539         int i, num, len;
2540         struct ctdb_iface_list_old *ifaces;
2541         struct ctdb_interface *cur;
2542
2543         /* count how many public ip structures we have */
2544         num = 0;
2545         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2546                 num++;
2547         }
2548
2549         len = offsetof(struct ctdb_iface_list_old, ifaces) +
2550                 num*sizeof(struct ctdb_iface);
2551         ifaces = talloc_zero_size(outdata, len);
2552         CTDB_NO_MEMORY(ctdb, ifaces);
2553
2554         i = 0;
2555         for (cur=ctdb->ifaces;cur;cur=cur->next) {
2556                 strncpy(ifaces->ifaces[i].name, cur->name,
2557                         sizeof(ifaces->ifaces[i].name));
2558                 ifaces->ifaces[i].name[sizeof(ifaces->ifaces[i].name)-1] = '\0';
2559                 ifaces->ifaces[i].link_state = cur->link_up;
2560                 ifaces->ifaces[i].references = cur->references;
2561                 i++;
2562         }
2563         ifaces->num = i;
2564         len = offsetof(struct ctdb_iface_list_old, ifaces) +
2565                 i*sizeof(struct ctdb_iface);
2566
2567         outdata->dsize = len;
2568         outdata->dptr  = (uint8_t *)ifaces;
2569
2570         return 0;
2571 }
2572
2573 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2574                                     struct ctdb_req_control_old *c,
2575                                     TDB_DATA indata)
2576 {
2577         struct ctdb_iface *info;
2578         struct ctdb_interface *iface;
2579         bool link_up = false;
2580
2581         info = (struct ctdb_iface *)indata.dptr;
2582
2583         if (info->name[CTDB_IFACE_SIZE] != '\0') {
2584                 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2585                 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2586                                   len, len, info->name));
2587                 return -1;
2588         }
2589
2590         switch (info->link_state) {
2591         case 0:
2592                 link_up = false;
2593                 break;
2594         case 1:
2595                 link_up = true;
2596                 break;
2597         default:
2598                 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2599                                   (unsigned int)info->link_state));
2600                 return -1;
2601         }
2602
2603         if (info->references != 0) {
2604                 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2605                                   (unsigned int)info->references));
2606                 return -1;
2607         }
2608
2609         iface = ctdb_find_iface(ctdb, info->name);
2610         if (iface == NULL) {
2611                 return -1;
2612         }
2613
2614         if (link_up == iface->link_up) {
2615                 return 0;
2616         }
2617
2618         DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2619               ("iface[%s] has changed it's link status %s => %s\n",
2620                iface->name,
2621                iface->link_up?"up":"down",
2622                link_up?"up":"down"));
2623
2624         iface->link_up = link_up;
2625         return 0;
2626 }
2627
2628
2629 /* 
2630    structure containing the listening socket and the list of tcp connections
2631    that the ctdb daemon is to kill
2632 */
2633 struct ctdb_kill_tcp {
2634         struct ctdb_vnn *vnn;
2635         struct ctdb_context *ctdb;
2636         int capture_fd;
2637         struct tevent_fd *fde;
2638         trbt_tree_t *connections;
2639         void *private_data;
2640 };
2641
2642 /*
2643   a tcp connection that is to be killed
2644  */
2645 struct ctdb_killtcp_con {
2646         ctdb_sock_addr src_addr;
2647         ctdb_sock_addr dst_addr;
2648         int count;
2649         struct ctdb_kill_tcp *killtcp;
2650 };
2651
2652 /* this function is used to create a key to represent this socketpair
2653    in the killtcp tree.
2654    this key is used to insert and lookup matching socketpairs that are
2655    to be tickled and RST
2656 */
2657 #define KILLTCP_KEYLEN  10
2658 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2659 {
2660         static uint32_t key[KILLTCP_KEYLEN];
2661
2662         bzero(key, sizeof(key));
2663
2664         if (src->sa.sa_family != dst->sa.sa_family) {
2665                 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2666                 return key;
2667         }
2668         
2669         switch (src->sa.sa_family) {
2670         case AF_INET:
2671                 key[0]  = dst->ip.sin_addr.s_addr;
2672                 key[1]  = src->ip.sin_addr.s_addr;
2673                 key[2]  = dst->ip.sin_port;
2674                 key[3]  = src->ip.sin_port;
2675                 break;
2676         case AF_INET6: {
2677                 uint32_t *dst6_addr32 =
2678                         (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
2679                 uint32_t *src6_addr32 =
2680                         (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
2681                 key[0]  = dst6_addr32[3];
2682                 key[1]  = src6_addr32[3];
2683                 key[2]  = dst6_addr32[2];
2684                 key[3]  = src6_addr32[2];
2685                 key[4]  = dst6_addr32[1];
2686                 key[5]  = src6_addr32[1];
2687                 key[6]  = dst6_addr32[0];
2688                 key[7]  = src6_addr32[0];
2689                 key[8]  = dst->ip6.sin6_port;
2690                 key[9]  = src->ip6.sin6_port;
2691                 break;
2692         }
2693         default:
2694                 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2695                 return key;
2696         }
2697
2698         return key;
2699 }
2700
2701 /*
2702   called when we get a read event on the raw socket
2703  */
2704 static void capture_tcp_handler(struct tevent_context *ev,
2705                                 struct tevent_fd *fde,
2706                                 uint16_t flags, void *private_data)
2707 {
2708         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2709         struct ctdb_killtcp_con *con;
2710         ctdb_sock_addr src, dst;
2711         uint32_t ack_seq, seq;
2712
2713         if (!(flags & TEVENT_FD_READ)) {
2714                 return;
2715         }
2716
2717         if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2718                                 killtcp->private_data,
2719                                 &src, &dst,
2720                                 &ack_seq, &seq) != 0) {
2721                 /* probably a non-tcp ACK packet */
2722                 return;
2723         }
2724
2725         /* check if we have this guy in our list of connections
2726            to kill
2727         */
2728         con = trbt_lookuparray32(killtcp->connections, 
2729                         KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2730         if (con == NULL) {
2731                 /* no this was some other packet we can just ignore */
2732                 return;
2733         }
2734
2735         /* This one has been tickled !
2736            now reset him and remove him from the list.
2737          */
2738         DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2739                 ntohs(con->dst_addr.ip.sin_port),
2740                 ctdb_addr_to_str(&con->src_addr),
2741                 ntohs(con->src_addr.ip.sin_port)));
2742
2743         ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2744         talloc_free(con);
2745 }
2746
2747
2748 /* when traversing the list of all tcp connections to send tickle acks to
2749    (so that we can capture the ack coming back and kill the connection
2750     by a RST)
2751    this callback is called for each connection we are currently trying to kill
2752 */
2753 static int tickle_connection_traverse(void *param, void *data)
2754 {
2755         struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2756
2757         /* have tried too many times, just give up */
2758         if (con->count >= 5) {
2759                 /* can't delete in traverse: reparent to delete_cons */
2760                 talloc_steal(param, con);
2761                 return 0;
2762         }
2763
2764         /* othervise, try tickling it again */
2765         con->count++;
2766         ctdb_sys_send_tcp(
2767                 (ctdb_sock_addr *)&con->dst_addr,
2768                 (ctdb_sock_addr *)&con->src_addr,
2769                 0, 0, 0);
2770         return 0;
2771 }
2772
2773
2774 /* 
2775    called every second until all sentenced connections have been reset
2776  */
2777 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
2778                                               struct tevent_timer *te,
2779                                               struct timeval t, void *private_data)
2780 {
2781         struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2782         void *delete_cons = talloc_new(NULL);
2783
2784         /* loop over all connections sending tickle ACKs */
2785         trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2786
2787         /* now we've finished traverse, it's safe to do deletion. */
2788         talloc_free(delete_cons);
2789
2790         /* If there are no more connections to kill we can remove the
2791            entire killtcp structure
2792          */
2793         if ( (killtcp->connections == NULL) || 
2794              (killtcp->connections->root == NULL) ) {
2795                 talloc_free(killtcp);
2796                 return;
2797         }
2798
2799         /* try tickling them again in a seconds time
2800          */
2801         tevent_add_timer(killtcp->ctdb->ev, killtcp,
2802                          timeval_current_ofs(1, 0),
2803                          ctdb_tickle_sentenced_connections, killtcp);
2804 }
2805
2806 /*
2807   destroy the killtcp structure
2808  */
2809 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2810 {
2811         struct ctdb_vnn *tmpvnn;
2812
2813         /* verify that this vnn is still active */
2814         for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
2815                 if (tmpvnn == killtcp->vnn) {
2816                         break;
2817                 }
2818         }
2819
2820         if (tmpvnn == NULL) {
2821                 return 0;
2822         }
2823
2824         if (killtcp->vnn->killtcp != killtcp) {
2825                 return 0;
2826         }
2827
2828         killtcp->vnn->killtcp = NULL;
2829
2830         return 0;
2831 }
2832
2833
/* insert callback for the killtcp tree: the new connection structure
   (parm) unconditionally replaces whatever was stored before (data).

   the old structure is deliberately not freed here; it is
   talloc_stolen by the same node in the tree and goes away when the
   new data is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
        void *replacement = parm;

        return replacement;
}
2845
2846 /*
2847   add a tcp socket to the list of connections we want to RST
2848  */
2849 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
2850                                        ctdb_sock_addr *s,
2851                                        ctdb_sock_addr *d)
2852 {
2853         ctdb_sock_addr src, dst;
2854         struct ctdb_kill_tcp *killtcp;
2855         struct ctdb_killtcp_con *con;
2856         struct ctdb_vnn *vnn;
2857
2858         ctdb_canonicalize_ip(s, &src);
2859         ctdb_canonicalize_ip(d, &dst);
2860
2861         vnn = find_public_ip_vnn(ctdb, &dst);
2862         if (vnn == NULL) {
2863                 vnn = find_public_ip_vnn(ctdb, &src);
2864         }
2865         if (vnn == NULL) {
2866                 /* if it is not a public ip   it could be our 'single ip' */
2867                 if (ctdb->single_ip_vnn) {
2868                         if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2869                                 vnn = ctdb->single_ip_vnn;
2870                         }
2871                 }
2872         }
2873         if (vnn == NULL) {
2874                 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
2875                 return -1;
2876         }
2877
2878         killtcp = vnn->killtcp;
2879         
2880         /* If this is the first connection to kill we must allocate
2881            a new structure
2882          */
2883         if (killtcp == NULL) {
2884                 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
2885                 CTDB_NO_MEMORY(ctdb, killtcp);
2886
2887                 killtcp->vnn         = vnn;
2888                 killtcp->ctdb        = ctdb;
2889                 killtcp->capture_fd  = -1;
2890                 killtcp->connections = trbt_create(killtcp, 0);
2891
2892                 vnn->killtcp         = killtcp;
2893                 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2894         }
2895
2896
2897
2898         /* create a structure that describes this connection we want to
2899            RST and store it in killtcp->connections
2900         */
2901         con = talloc(killtcp, struct ctdb_killtcp_con);
2902         CTDB_NO_MEMORY(ctdb, con);
2903         con->src_addr = src;
2904         con->dst_addr = dst;
2905         con->count    = 0;
2906         con->killtcp  = killtcp;
2907
2908
2909         trbt_insertarray32_callback(killtcp->connections,
2910                         KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2911                         add_killtcp_callback, con);
2912
2913         /* 
2914            If we don't have a socket to listen on yet we must create it
2915          */
2916         if (killtcp->capture_fd == -1) {
2917                 const char *iface = ctdb_vnn_iface_string(vnn);
2918                 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2919                 if (killtcp->capture_fd == -1) {
2920                         DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2921                                           "socket on iface '%s' for killtcp (%s)\n",
2922                                           iface, strerror(errno)));
2923                         goto failed;
2924                 }
2925         }
2926
2927
2928         if (killtcp->fde == NULL) {
2929                 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
2930                                              killtcp->capture_fd,
2931                                              TEVENT_FD_READ,
2932                                              capture_tcp_handler, killtcp);
2933                 tevent_fd_set_auto_close(killtcp->fde);
2934
2935                 /* We also need to set up some events to tickle all these connections
2936                    until they are all reset
2937                 */
2938                 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
2939                                  ctdb_tickle_sentenced_connections, killtcp);
2940         }
2941
2942         /* tickle him once now */
2943         ctdb_sys_send_tcp(
2944                 &con->dst_addr,
2945                 &con->src_addr,
2946                 0, 0, 0);
2947
2948         return 0;
2949
2950 failed:
2951         talloc_free(vnn->killtcp);
2952         vnn->killtcp = NULL;
2953         return -1;
2954 }
2955
2956 /*
2957   kill a TCP connection.
2958  */
2959 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2960 {
2961         struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
2962
2963         return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
2964 }
2965
2966 /*
2967   called by a daemon to inform us of the entire list of TCP tickles for
2968   a particular public address.
2969   this control should only be sent by the node that is currently serving
2970   that public address.
2971  */
2972 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2973 {
2974         struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
2975         struct ctdb_tcp_array *tcparray;
2976         struct ctdb_vnn *vnn;
2977
2978         /* We must at least have tickles.num or else we cant verify the size
2979            of the received data blob
2980          */
2981         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
2982                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
2983                 return -1;
2984         }
2985
2986         /* verify that the size of data matches what we expect */
2987         if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
2988                          + sizeof(struct ctdb_connection) * list->num) {
2989                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
2990                 return -1;
2991         }
2992
2993         DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
2994                            ctdb_addr_to_str(&list->addr)));
2995
2996         vnn = find_public_ip_vnn(ctdb, &list->addr);
2997         if (vnn == NULL) {
2998                 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
2999                         ctdb_addr_to_str(&list->addr)));
3000
3001                 return 1;
3002         }
3003
3004         if (vnn->pnn == ctdb->pnn) {
3005                 DEBUG(DEBUG_INFO,
3006                       ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
3007                        ctdb_addr_to_str(&list->addr)));
3008                 return 0;
3009         }
3010
3011         /* remove any old ticklelist we might have */
3012         talloc_free(vnn->tcp_array);
3013         vnn->tcp_array = NULL;
3014
3015         tcparray = talloc(vnn, struct ctdb_tcp_array);
3016         CTDB_NO_MEMORY(ctdb, tcparray);
3017
3018         tcparray->num = list->num;
3019
3020         tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3021         CTDB_NO_MEMORY(ctdb, tcparray->connections);
3022
3023         memcpy(tcparray->connections, &list->connections[0],
3024                sizeof(struct ctdb_connection)*tcparray->num);
3025
3026         /* We now have a new fresh tickle list array for this vnn */
3027         vnn->tcp_array = tcparray;
3028
3029         return 0;
3030 }
3031
3032 /*
3033   called to return the full list of tickles for the puclic address associated 
3034   with the provided vnn
3035  */
3036 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3037 {
3038         ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3039         struct ctdb_tickle_list_old *list;
3040         struct ctdb_tcp_array *tcparray;
3041         int num;
3042         struct ctdb_vnn *vnn;
3043
3044         vnn = find_public_ip_vnn(ctdb, addr);
3045         if (vnn == NULL) {
3046                 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
3047                         ctdb_addr_to_str(addr)));
3048
3049                 return 1;
3050         }
3051
3052         tcparray = vnn->tcp_array;
3053         if (tcparray) {
3054                 num = tcparray->num;
3055         } else {
3056                 num = 0;
3057         }
3058
3059         outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3060                         + sizeof(struct ctdb_connection) * num;
3061
3062         outdata->dptr  = talloc_size(outdata, outdata->dsize);
3063         CTDB_NO_MEMORY(ctdb, outdata->dptr);
3064         list = (struct ctdb_tickle_list_old *)outdata->dptr;
3065
3066         list->addr = *addr;
3067         list->num = num;
3068         if (num) {
3069                 memcpy(&list->connections[0], tcparray->connections,
3070                         sizeof(struct ctdb_connection) * num);
3071         }
3072
3073         return 0;
3074 }
3075
3076
3077 /*
3078   set the list of all tcp tickles for a public address
3079  */
3080 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3081                                             ctdb_sock_addr *addr,
3082                                             struct ctdb_tcp_array *tcparray)
3083 {
3084         int ret, num;
3085         TDB_DATA data;
3086         struct ctdb_tickle_list_old *list;
3087
3088         if (tcparray) {
3089                 num = tcparray->num;
3090         } else {
3091                 num = 0;
3092         }
3093
3094         data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3095                         sizeof(struct ctdb_connection) * num;
3096         data.dptr = talloc_size(ctdb, data.dsize);
3097         CTDB_NO_MEMORY(ctdb, data.dptr);
3098
3099         list = (struct ctdb_tickle_list_old *)data.dptr;
3100         list->addr = *addr;
3101         list->num = num;
3102         if (tcparray) {
3103                 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3104         }
3105
3106         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3107                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3108                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3109         if (ret != 0) {
3110                 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3111                 return -1;
3112         }
3113
3114         talloc_free(data.dptr);
3115
3116         return ret;
3117 }
3118
3119
3120 /*
3121   perform tickle updates if required
3122  */
3123 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3124                                     struct tevent_timer *te,
3125                                     struct timeval t, void *private_data)
3126 {
3127         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3128         int ret;
3129         struct ctdb_vnn *vnn;
3130
3131         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3132                 /* we only send out updates for public addresses that 
3133                    we have taken over
3134                  */
3135                 if (ctdb->pnn != vnn->pnn) {
3136                         continue;
3137                 }
3138                 /* We only send out the updates if we need to */
3139                 if (!vnn->tcp_update_needed) {
3140                         continue;
3141                 }
3142                 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3143                                                        &vnn->public_address,
3144                                                        vnn->tcp_array);
3145                 if (ret != 0) {
3146                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3147                                 ctdb_addr_to_str(&vnn->public_address)));
3148                 } else {
3149                         DEBUG(DEBUG_INFO,
3150                               ("Sent tickle update for public address %s\n",
3151                                ctdb_addr_to_str(&vnn->public_address)));
3152                         vnn->tcp_update_needed = false;
3153                 }
3154         }
3155
3156         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3157                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3158                          ctdb_update_tcp_tickles, ctdb);
3159 }
3160
3161 /*
3162   start periodic update of tcp tickles
3163  */
3164 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3165 {
3166         ctdb->tickle_update_context = talloc_new(ctdb);
3167
3168         tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3169                          timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3170                          ctdb_update_tcp_tickles, ctdb);
3171 }
3172
3173
3174
3175
3176 struct control_gratious_arp {
3177         struct ctdb_context *ctdb;
3178         ctdb_sock_addr addr;
3179         const char *iface;
3180         int count;
3181 };
3182
3183 /*
3184   send a control_gratuitous arp
3185  */
3186 static void send_gratious_arp(struct tevent_context *ev,
3187                               struct tevent_timer *te,
3188                               struct timeval t, void *private_data)
3189 {
3190         int ret;
3191         struct control_gratious_arp *arp = talloc_get_type(private_data, 
3192                                                         struct control_gratious_arp);
3193
3194         ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3195         if (ret != 0) {
3196                 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3197                                  arp->iface, strerror(errno)));
3198         }
3199
3200
3201         arp->count++;
3202         if (arp->count == CTDB_ARP_REPEAT) {
3203                 talloc_free(arp);
3204                 return;
3205         }
3206
3207         tevent_add_timer(arp->ctdb->ev, arp,
3208                          timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3209                          send_gratious_arp, arp);
3210 }
3211
3212
3213 /*
3214   send a gratious arp 
3215  */
3216 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3217 {
3218         struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
3219         struct control_gratious_arp *arp;
3220
3221         /* verify the size of indata */
3222         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3223                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
3224                                  (unsigned)indata.dsize, 
3225                                  (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
3226                 return -1;
3227         }
3228         if (indata.dsize != 
3229                 ( offsetof(struct ctdb_addr_info_old, iface)
3230                 + gratious_arp->len ) ){
3231
3232                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3233                         "but should be %u bytes\n", 
3234                          (unsigned)indata.dsize, 
3235                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
3236                 return -1;
3237         }
3238
3239
3240         arp = talloc(ctdb, struct control_gratious_arp);
3241         CTDB_NO_MEMORY(ctdb, arp);
3242
3243         arp->ctdb  = ctdb;
3244         arp->addr   = gratious_arp->addr;
3245         arp->iface = talloc_strdup(arp, gratious_arp->iface);
3246         CTDB_NO_MEMORY(ctdb, arp->iface);
3247         arp->count = 0;
3248
3249         tevent_add_timer(arp->ctdb->ev, arp,
3250                          timeval_zero(), send_gratious_arp, arp);
3251
3252         return 0;
3253 }
3254
3255 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3256 {
3257         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3258         int ret;
3259
3260         /* verify the size of indata */
3261         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3262                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3263                 return -1;
3264         }
3265         if (indata.dsize != 
3266                 ( offsetof(struct ctdb_addr_info_old, iface)
3267                 + pub->len ) ){
3268
3269                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3270                         "but should be %u bytes\n", 
3271                          (unsigned)indata.dsize, 
3272                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
3273                 return -1;
3274         }
3275
3276         DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
3277
3278         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3279
3280         if (ret != 0) {
3281                 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3282                 return -1;
3283         }
3284
3285         return 0;
3286 }
3287
/* Private data handed to delete_ip_callback() */
struct delete_ip_callback_state {
	/* the DEL_PUBLIC_IP control request that still needs a reply */
	struct ctdb_req_control_old *c;
};
3291
3292 /*
3293   called when releaseip event finishes for del_public_address
3294  */
3295 static void delete_ip_callback(struct ctdb_context *ctdb,
3296                                int32_t status, TDB_DATA data,
3297                                const char *errormsg,
3298                                void *private_data)
3299 {
3300         struct delete_ip_callback_state *state =
3301                 talloc_get_type(private_data, struct delete_ip_callback_state);
3302
3303         /* If release failed then fail. */
3304         ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
3305         talloc_free(private_data);
3306 }
3307
3308 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
3309                                         struct ctdb_req_control_old *c,
3310                                         TDB_DATA indata, bool *async_reply)
3311 {
3312         struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3313         struct ctdb_vnn *vnn;
3314
3315         /* verify the size of indata */
3316         if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3317                 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3318                 return -1;
3319         }
3320         if (indata.dsize != 
3321                 ( offsetof(struct ctdb_addr_info_old, iface)
3322                 + pub->len ) ){
3323
3324                 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3325                         "but should be %u bytes\n", 
3326                          (unsigned)indata.dsize, 
3327                          (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
3328                 return -1;
3329         }
3330
3331         DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
3332
3333         /* walk over all public addresses until we find a match */
3334         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3335                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3336                         if (vnn->pnn == ctdb->pnn) {
3337                                 struct delete_ip_callback_state *state;
3338                                 struct ctdb_public_ip *ip;
3339                                 TDB_DATA data;
3340                                 int ret;
3341
3342                                 vnn->delete_pending = true;
3343
3344                                 state = talloc(ctdb,
3345                                                struct delete_ip_callback_state);
3346                                 CTDB_NO_MEMORY(ctdb, state);
3347                                 state->c = c;
3348
3349                                 ip = talloc(state, struct ctdb_public_ip);
3350                                 if (ip == NULL) {
3351                                         DEBUG(DEBUG_ERR,
3352                                               (__location__ " Out of memory\n"));
3353                                         talloc_free(state);
3354                                         return -1;
3355                                 }
3356                                 ip->pnn = -1;
3357                                 ip->addr = pub->addr;
3358
3359                                 data.dsize = sizeof(struct ctdb_public_ip);
3360                                 data.dptr = (unsigned char *)ip;
3361
3362                                 ret = ctdb_daemon_send_control(ctdb,
3363                                                                ctdb_get_pnn(ctdb),
3364                                                                0,
3365                                                                CTDB_CONTROL_RELEASE_IP,
3366                                                                0, 0,
3367                                                                data,
3368                                                                delete_ip_callback,
3369                                                                state);
3370                                 if (ret == -1) {
3371                                         DEBUG(DEBUG_ERR,
3372                                               (__location__ "Unable to send "
3373                                                "CTDB_CONTROL_RELEASE_IP\n"));
3374                                         talloc_free(state);
3375                                         return -1;
3376                                 }
3377
3378                                 state->c = talloc_steal(state, c);
3379                                 *async_reply = true;
3380                         } else {
3381                                 /* This IP is not hosted on the
3382                                  * current node so just delete it
3383                                  * now. */
3384                                 do_delete_ip(ctdb, vnn);
3385                         }
3386
3387                         return 0;
3388                 }
3389         }
3390
3391         DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
3392                          ctdb_addr_to_str(&pub->addr)));
3393         return -1;
3394 }
3395
3396
/* Private data handed to ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
	/* the control request to answer once the event has finished */
	struct ctdb_req_control_old *c;
};
3400
3401 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3402                                         int status, void *p)
3403 {
3404         struct ipreallocated_callback_state *state =
3405                 talloc_get_type(p, struct ipreallocated_callback_state);
3406
3407         if (status != 0) {
3408                 DEBUG(DEBUG_ERR,
3409                       (" \"ipreallocated\" event script failed (status %d)\n",
3410                        status));
3411                 if (status == -ETIME) {
3412                         ctdb_ban_self(ctdb);
3413                 }
3414         }
3415
3416         ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3417         talloc_free(state);
3418 }
3419
3420 /* A control to run the ipreallocated event */
3421 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3422                                    struct ctdb_req_control_old *c,
3423                                    bool *async_reply)
3424 {
3425         int ret;
3426         struct ipreallocated_callback_state *state;
3427
3428         state = talloc(ctdb, struct ipreallocated_callback_state);
3429         CTDB_NO_MEMORY(ctdb, state);
3430
3431         DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3432
3433         ret = ctdb_event_script_callback(ctdb, state,
3434                                          ctdb_ipreallocated_callback, state,
3435                                          CTDB_EVENT_IPREALLOCATED,
3436                                          "%s", "");
3437
3438         if (ret != 0) {
3439                 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3440                 talloc_free(state);
3441                 return -1;
3442         }
3443
3444         /* tell the control that we will be reply asynchronously */
3445         state->c    = talloc_steal(state, c);
3446         *async_reply = true;
3447
3448         return 0;
3449 }
3450
3451
3452 /* This function is called from the recovery daemon to verify that a remote
3453    node has the expected ip allocation.
3454    This is verified against ctdb->ip_tree
3455 */
3456 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
3457                                        struct ctdb_public_ip_list *ips,
3458                                        uint32_t pnn)
3459 {
3460         struct public_ip_list *tmp_ip;
3461         int i;
3462
3463         if (ctdb->ip_tree == NULL) {
3464                 /* don't know the expected allocation yet, assume remote node
3465                    is correct. */
3466                 return 0;
3467         }
3468
3469         if (ips == NULL) {
3470                 return 0;
3471         }
3472
3473         for (i=0; i<ips->num; i++) {
3474                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ip[i].addr));
3475                 if (tmp_ip == NULL) {
3476                         DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ip[i].addr)));
3477                         return -1;
3478                 }
3479
3480                 if (tmp_ip->pnn == -1 || ips->ip[i].pnn == -1) {
3481                         continue;
3482                 }
3483
3484                 if (tmp_ip->pnn != ips->ip[i].pnn) {
3485                         DEBUG(DEBUG_ERR,
3486                               ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
3487                                pnn,
3488                                ctdb_addr_to_str(&ips->ip[i].addr),
3489                                ips->ip[i].pnn, tmp_ip->pnn));
3490                         return -1;
3491                 }
3492         }
3493
3494         return 0;
3495 }
3496
3497 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3498 {
3499         struct public_ip_list *tmp_ip;
3500
3501         /* IP tree is never built if DisableIPFailover is set */
3502         if (ctdb->tunable.disable_ip_failover != 0) {
3503                 return 0;
3504         }
3505
3506         if (ctdb->ip_tree == NULL) {
3507                 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3508                 return -1;
3509         }
3510
3511         tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3512         if (tmp_ip == NULL) {
3513                 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3514                 return -1;
3515         }
3516
3517         DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3518         tmp_ip->pnn = ip->pnn;
3519
3520         return 0;
3521 }
3522
3523 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
3524 {
3525         TALLOC_FREE(ctdb->ip_tree);
3526 }
3527
/* Bookkeeping for one in-flight reloadips request and its child process */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* control request to answer from the destructor; NULL if none */
	struct ctdb_req_control_old *c;
	/* status sent in that reply */
	int status;
	/* pipe: the child writes its result to fd[1], the parent reads fd[0] */
	int fd[2];
	/* pid of the child, killed from the destructor */
	pid_t child;
	/* fd event watching fd[0] */
	struct tevent_fd *fde;
};
3536
3537 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3538 {
3539         if (h == h->ctdb->reload_ips) {
3540                 h->ctdb->reload_ips = NULL;
3541         }
3542         if (h->c != NULL) {
3543                 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3544                 h->c = NULL;
3545         }
3546         ctdb_kill(h->ctdb, h->child, SIGKILL);
3547         return 0;
3548 }
3549
3550 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
3551                                          struct tevent_timer *te,
3552                                          struct timeval t, void *private_data)
3553 {
3554         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3555
3556         talloc_free(h);
3557 }
3558
3559 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
3560                                          struct tevent_fd *fde,
3561                                          uint16_t flags, void *private_data)
3562 {
3563         struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3564
3565         char res;
3566         int ret;
3567
3568         ret = sys_read(h->fd[0], &res, 1);
3569         if (ret < 1 || res != 0) {
3570                 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3571                 res = 1;
3572         }
3573         h->status = res;
3574
3575         talloc_free(h);
3576 }
3577
3578 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3579 {
3580         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3581         struct ctdb_public_ip_list_old *ips;
3582         struct ctdb_vnn *vnn;
3583         struct client_async_data *async_data;
3584         struct timeval timeout;
3585         TDB_DATA data;
3586         struct ctdb_client_control_state *state;
3587         bool first_add;
3588         int i, ret;
3589
3590         CTDB_NO_MEMORY(ctdb, mem_ctx);
3591
3592         /* Read IPs from local node */
3593         ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
3594                                        CTDB_CURRENT_NODE, mem_ctx, &ips);
3595         if (ret != 0) {
3596                 DEBUG(DEBUG_ERR,
3597                       ("Unable to fetch public IPs from local node\n"));
3598                 talloc_free(mem_ctx);
3599                 return -1;
3600         }
3601
3602         /* Read IPs file - this is safe since this is a child process */
3603         ctdb->vnn = NULL;
3604         if (ctdb_set_public_addresses(ctdb, false) != 0) {
3605                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3606                 talloc_free(mem_ctx);
3607                 return -1;
3608         }
3609
3610         async_data = talloc_zero(mem_ctx, struct client_async_data);
3611         CTDB_NO_MEMORY(ctdb, async_data);
3612
3613         /* Compare IPs between node and file for IPs to be deleted */
3614         for (i = 0; i < ips->num; i++) {
3615                 /* */
3616                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3617                         if (ctdb_same_ip(&vnn->public_address,
3618                                          &ips->ips[i].addr)) {
3619                                 /* IP is still in file */
3620                                 break;
3621                         }
3622                 }
3623
3624                 if (vnn == NULL) {
3625                         /* Delete IP ips->ips[i] */
3626                         struct ctdb_addr_info_old *pub;
3627
3628                         DEBUG(DEBUG_NOTICE,
3629                               ("IP %s no longer configured, deleting it\n",
3630                                ctdb_addr_to_str(&ips->ips[i].addr)));
3631
3632                         pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
3633                         CTDB_NO_MEMORY(ctdb, pub);
3634
3635                         pub->addr  = ips->ips[i].addr;
3636                         pub->mask  = 0;
3637                         pub->len   = 0;
3638
3639                         timeout = TAKEOVER_TIMEOUT();
3640
3641                         data.dsize = offsetof(struct ctdb_addr_info_old,
3642                                               iface) + pub->len;
3643                         data.dptr = (uint8_t *)pub;
3644
3645                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
3646                                                   CTDB_CONTROL_DEL_PUBLIC_IP,
3647                                                   0, data, async_data,
3648                                                   &timeout, NULL);
3649                         if (state == NULL) {
3650                                 DEBUG(DEBUG_ERR,
3651                                       (__location__
3652                                        " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
3653                                 goto failed;
3654                         }
3655
3656                         ctdb_client_async_add(async_data, state);
3657                 }
3658         }
3659
3660         /* Compare IPs between node and file for IPs to be added */
3661         first_add = true;
3662         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3663                 for (i = 0; i < ips->num; i++) {
3664                         if (ctdb_same_ip(&vnn->public_address,
3665                                          &ips->ips[i].addr)) {
3666                                 /* IP already on node */
3667                                 break;
3668                         }
3669                 }
3670                 if (i == ips->num) {
3671                         /* Add IP ips->ips[i] */
3672                         struct ctdb_addr_info_old *pub;
3673                         const char *ifaces = NULL;
3674                         uint32_t len;
3675                         int iface = 0;
3676
3677                         DEBUG(DEBUG_NOTICE,
3678                               ("New IP %s configured, adding it\n",
3679                                ctdb_addr_to_str(&vnn->public_address)));
3680                         if (first_add) {
3681                                 uint32_t pnn = ctdb_get_pnn(ctdb);
3682
3683                                 data.dsize = sizeof(pnn);
3684                                 data.dptr  = (uint8_t *)&pnn;
3685
3686                                 ret = ctdb_client_send_message(
3687                                         ctdb,
3688                                         CTDB_BROADCAST_CONNECTED,
3689                                         CTDB_SRVID_REBALANCE_NODE,
3690                                         data);
3691                                 if (ret != 0) {
3692                                         DEBUG(DEBUG_WARNING,
3693                                               ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
3694                                 }
3695
3696                                 first_add = false;
3697                         }
3698
3699                         ifaces = vnn->ifaces[0];
3700                         iface = 1;
3701                         while (vnn->ifaces[iface] != NULL) {
3702                                 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
3703                                                          vnn->ifaces[iface]);
3704                                 iface++;
3705                         }
3706
3707                         len   = strlen(ifaces) + 1;
3708                         pub = talloc_zero_size(mem_ctx,
3709                                                offsetof(struct ctdb_addr_info_old, iface) + len);
3710                         CTDB_NO_MEMORY(ctdb, pub);
3711
3712                         pub->addr  = vnn->public_address;
3713                         pub->mask  = vnn->public_netmask_bits;
3714                         pub->len   = len;
3715                         memcpy(&pub->iface[0], ifaces, pub->len);
3716
3717                         timeout = TAKEOVER_TIMEOUT();
3718
3719                         data.dsize = offsetof(struct ctdb_addr_info_old,
3720                                               iface) + pub->len;
3721                         data.dptr = (uint8_t *)pub;
3722
3723                         state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
3724                                                   CTDB_CONTROL_ADD_PUBLIC_IP,
3725                                                   0, data, async_data,
3726                                                   &timeout, NULL);
3727                         if (state == NULL) {
3728                                 DEBUG(DEBUG_ERR,
3729                                       (__location__
3730                                        " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
3731                                 goto failed;
3732                         }
3733
3734                         ctdb_client_async_add(async_data, state);
3735                 }
3736         }
3737
3738         if (ctdb_client_async_wait(ctdb, async_data) != 0) {
3739                 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
3740                 goto failed;
3741         }
3742
3743         talloc_free(mem_ctx);
3744         return 0;
3745
3746 failed:
3747         talloc_free(mem_ctx);
3748         return -1;
3749 }
3750
3751 /* This control is sent to force the node to re-read the public addresses file
3752    and drop any addresses we should nnot longer host, and add new addresses
3753    that we are now able to host
3754 */
3755 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
3756 {
3757         struct ctdb_reloadips_handle *h;
3758         pid_t parent = getpid();
3759
3760         if (ctdb->reload_ips != NULL) {
3761                 talloc_free(ctdb->reload_ips);
3762                 ctdb->reload_ips = NULL;
3763         }
3764
3765         h = talloc(ctdb, struct ctdb_reloadips_handle);
3766         CTDB_NO_MEMORY(ctdb, h);
3767         h->ctdb     = ctdb;
3768         h->c        = NULL;
3769         h->status   = -1;
3770         
3771         if (pipe(h->fd) == -1) {
3772                 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3773                 talloc_free(h);
3774                 return -1;
3775         }
3776
3777         h->child = ctdb_fork(ctdb);
3778         if (h->child == (pid_t)-1) {
3779                 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
3780                 close(h->fd[0]);
3781                 close(h->fd[1]);
3782                 talloc_free(h);
3783                 return -1;
3784         }
3785
3786         /* child process */
3787         if (h->child == 0) {
3788                 signed char res = 0;
3789
3790                 close(h->fd[0]);
3791                 debug_extra = talloc_asprintf(NULL, "reloadips:");
3792
3793                 prctl_set_comment("ctdb_reloadips");
3794                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
3795                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
3796                         res = -1;
3797                 } else {
3798                         res = ctdb_reloadips_child(ctdb);
3799                         if (res != 0) {
3800                                 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
3801                         }
3802                 }
3803
3804                 sys_write(h->fd[1], &res, 1);
3805                 ctdb_wait_for_process_to_exit(parent);
3806                 _exit(0);
3807         }
3808
3809         h->c             = talloc_steal(h, c);
3810
3811         close(h->fd[1]);
3812         set_close_on_exec(h->fd[0]);
3813
3814         talloc_set_destructor(h, ctdb_reloadips_destructor);
3815
3816
3817         h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
3818                                ctdb_reloadips_child_handler, (void *)h);
3819         tevent_fd_set_auto_close(h->fde);
3820
3821         tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
3822                          ctdb_reloadips_timeout_event, h);
3823
3824         /* we reply later */
3825         *async_reply = true;
3826         return 0;
3827 }