4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
45 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
47 #define CTDB_ARP_INTERVAL 1
48 #define CTDB_ARP_REPEAT 3
50 /* Flags used in IP allocation algorithms. */
56 struct ctdb_interface {
57 struct ctdb_interface *prev, *next;
63 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
66 return vnn->iface->name;
72 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
74 struct ctdb_interface *i;
76 /* Verify that we don't have an entry for this interface yet */
77 for (i=ctdb->ifaces;i;i=i->next) {
78 if (strcmp(i->name, iface) == 0) {
83 /* create a new structure for this interface */
84 i = talloc_zero(ctdb, struct ctdb_interface);
85 CTDB_NO_MEMORY_FATAL(ctdb, i);
86 i->name = talloc_strdup(i, iface);
87 CTDB_NO_MEMORY(ctdb, i->name);
91 DLIST_ADD(ctdb->ifaces, i);
96 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
101 for (n = 0; vnn->ifaces[n] != NULL; n++) {
102 if (strcmp(name, vnn->ifaces[n]) == 0) {
110 /* If any interfaces now have no possible IPs then delete them. This
111 * implementation is naive (i.e. simple) rather than clever
112 * (i.e. complex). Given that this is run on delip and that operation
113 * is rare, this doesn't need to be efficient - it needs to be
114 * foolproof. One alternative is reference counting, where the logic
115 * is distributed and can, therefore, be broken in multiple places.
116 * Another alternative is to build a red-black tree of interfaces that
117 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
118 * once) and then walking ctdb->ifaces once and deleting those not in
119 * the tree. Let's go to one of those if the naive implementation
120 * causes problems... :-)
122 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
123 struct ctdb_vnn *vnn)
125 struct ctdb_interface *i, *next;
127 /* For each interface, check if there's an IP using it. */
128 for (i = ctdb->ifaces; i != NULL; i = next) {
133 /* Only consider interfaces named in the given VNN. */
134 if (!vnn_has_interface_with_name(vnn, i->name)) {
138 /* Is the "single IP" on this interface? */
139 if ((ctdb->single_ip_vnn != NULL) &&
140 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
141 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
142 /* Found, next interface please... */
145 /* Search for a vnn with this interface. */
147 for (tv=ctdb->vnn; tv; tv=tv->next) {
148 if (vnn_has_interface_with_name(tv, i->name)) {
155 /* None of the VNNs are using this interface. */
156 DLIST_REMOVE(ctdb->ifaces, i);
163 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
166 struct ctdb_interface *i;
168 for (i=ctdb->ifaces;i;i=i->next) {
169 if (strcmp(i->name, iface) == 0) {
177 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
178 struct ctdb_vnn *vnn)
181 struct ctdb_interface *cur = NULL;
182 struct ctdb_interface *best = NULL;
184 for (i=0; vnn->ifaces[i]; i++) {
186 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
200 if (cur->references < best->references) {
209 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
210 struct ctdb_vnn *vnn)
212 struct ctdb_interface *best = NULL;
215 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
216 "still assigned to iface '%s'\n",
217 ctdb_addr_to_str(&vnn->public_address),
218 ctdb_vnn_iface_string(vnn)));
222 best = ctdb_vnn_best_iface(ctdb, vnn);
224 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
225 "cannot assign to iface any iface\n",
226 ctdb_addr_to_str(&vnn->public_address)));
232 vnn->pnn = ctdb->pnn;
234 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
235 "now assigned to iface '%s' refs[%d]\n",
236 ctdb_addr_to_str(&vnn->public_address),
237 ctdb_vnn_iface_string(vnn),
242 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
243 struct ctdb_vnn *vnn)
245 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
246 "now unassigned (old iface '%s' refs[%d])\n",
247 ctdb_addr_to_str(&vnn->public_address),
248 ctdb_vnn_iface_string(vnn),
249 vnn->iface?vnn->iface->references:0));
251 vnn->iface->references--;
254 if (vnn->pnn == ctdb->pnn) {
259 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
260 struct ctdb_vnn *vnn)
264 /* Nodes that are not RUNNING can not host IPs */
265 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
269 if (vnn->delete_pending) {
273 if (vnn->iface && vnn->iface->link_up) {
277 for (i=0; vnn->ifaces[i]; i++) {
278 struct ctdb_interface *cur;
280 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
293 struct ctdb_takeover_arp {
294 struct ctdb_context *ctdb;
297 struct ctdb_tcp_array *tcparray;
298 struct ctdb_vnn *vnn;
303 lists of tcp endpoints
305 struct ctdb_tcp_list {
306 struct ctdb_tcp_list *prev, *next;
307 struct ctdb_connection connection;
311 list of clients to kill on IP release
313 struct ctdb_client_ip {
314 struct ctdb_client_ip *prev, *next;
315 struct ctdb_context *ctdb;
322 send a gratuitous arp
324 static void ctdb_control_send_arp(struct tevent_context *ev,
325 struct tevent_timer *te,
326 struct timeval t, void *private_data)
328 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
329 struct ctdb_takeover_arp);
331 struct ctdb_tcp_array *tcparray;
332 const char *iface = ctdb_vnn_iface_string(arp->vnn);
334 ret = ctdb_sys_send_arp(&arp->addr, iface);
336 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
337 iface, strerror(errno)));
340 tcparray = arp->tcparray;
342 for (i=0;i<tcparray->num;i++) {
343 struct ctdb_connection *tcon;
345 tcon = &tcparray->connections[i];
346 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
347 (unsigned)ntohs(tcon->dst.ip.sin_port),
348 ctdb_addr_to_str(&tcon->src),
349 (unsigned)ntohs(tcon->src.ip.sin_port)));
350 ret = ctdb_sys_send_tcp(
355 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
356 ctdb_addr_to_str(&tcon->src)));
363 if (arp->count == CTDB_ARP_REPEAT) {
368 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
369 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
370 ctdb_control_send_arp, arp);
373 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
374 struct ctdb_vnn *vnn)
376 struct ctdb_takeover_arp *arp;
377 struct ctdb_tcp_array *tcparray;
379 if (!vnn->takeover_ctx) {
380 vnn->takeover_ctx = talloc_new(vnn);
381 if (!vnn->takeover_ctx) {
386 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
392 arp->addr = vnn->public_address;
395 tcparray = vnn->tcp_array;
397 /* add all of the known tcp connections for this IP to the
398 list of tcp connections to send tickle acks for */
399 arp->tcparray = talloc_steal(arp, tcparray);
401 vnn->tcp_array = NULL;
402 vnn->tcp_update_needed = true;
405 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
406 timeval_zero(), ctdb_control_send_arp, arp);
411 struct takeover_callback_state {
412 struct ctdb_req_control_old *c;
413 ctdb_sock_addr *addr;
414 struct ctdb_vnn *vnn;
417 struct ctdb_do_takeip_state {
418 struct ctdb_req_control_old *c;
419 struct ctdb_vnn *vnn;
423 called when takeip event finishes
425 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
428 struct ctdb_do_takeip_state *state =
429 talloc_get_type(private_data, struct ctdb_do_takeip_state);
434 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
436 if (status == -ETIME) {
439 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
440 ctdb_addr_to_str(&state->vnn->public_address),
441 ctdb_vnn_iface_string(state->vnn)));
442 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
444 node->flags |= NODE_FLAGS_UNHEALTHY;
449 if (ctdb->do_checkpublicip) {
451 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
453 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
460 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
461 data.dsize = strlen((char *)data.dptr) + 1;
462 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
464 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
467 /* the control succeeded */
468 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
473 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
475 state->vnn->update_in_flight = false;
480 take over an ip address
482 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
483 struct ctdb_req_control_old *c,
484 struct ctdb_vnn *vnn)
487 struct ctdb_do_takeip_state *state;
489 if (vnn->update_in_flight) {
490 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
491 "update for this IP already in flight\n",
492 ctdb_addr_to_str(&vnn->public_address),
493 vnn->public_netmask_bits));
497 ret = ctdb_vnn_assign_iface(ctdb, vnn);
499 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
500 "assign a usable interface\n",
501 ctdb_addr_to_str(&vnn->public_address),
502 vnn->public_netmask_bits));
506 state = talloc(vnn, struct ctdb_do_takeip_state);
507 CTDB_NO_MEMORY(ctdb, state);
509 state->c = talloc_steal(ctdb, c);
512 vnn->update_in_flight = true;
513 talloc_set_destructor(state, ctdb_takeip_destructor);
515 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
516 ctdb_addr_to_str(&vnn->public_address),
517 vnn->public_netmask_bits,
518 ctdb_vnn_iface_string(vnn)));
520 ret = ctdb_event_script_callback(ctdb,
522 ctdb_do_takeip_callback,
526 ctdb_vnn_iface_string(vnn),
527 ctdb_addr_to_str(&vnn->public_address),
528 vnn->public_netmask_bits);
531 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
532 ctdb_addr_to_str(&vnn->public_address),
533 ctdb_vnn_iface_string(vnn)));
541 struct ctdb_do_updateip_state {
542 struct ctdb_req_control_old *c;
543 struct ctdb_interface *old;
544 struct ctdb_vnn *vnn;
548 called when updateip event finishes
550 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
553 struct ctdb_do_updateip_state *state =
554 talloc_get_type(private_data, struct ctdb_do_updateip_state);
558 if (status == -ETIME) {
561 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
562 ctdb_addr_to_str(&state->vnn->public_address),
564 ctdb_vnn_iface_string(state->vnn)));
567 * All we can do is reset the old interface
568 * and let the next run fix it
570 ctdb_vnn_unassign_iface(ctdb, state->vnn);
571 state->vnn->iface = state->old;
572 state->vnn->iface->references++;
574 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
579 if (ctdb->do_checkpublicip) {
581 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
583 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
590 /* the control succeeded */
591 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
596 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
598 state->vnn->update_in_flight = false;
603 update (move) an ip address
605 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
606 struct ctdb_req_control_old *c,
607 struct ctdb_vnn *vnn)
610 struct ctdb_do_updateip_state *state;
611 struct ctdb_interface *old = vnn->iface;
612 const char *new_name;
614 if (vnn->update_in_flight) {
615 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
616 "update for this IP already in flight\n",
617 ctdb_addr_to_str(&vnn->public_address),
618 vnn->public_netmask_bits));
622 ctdb_vnn_unassign_iface(ctdb, vnn);
623 ret = ctdb_vnn_assign_iface(ctdb, vnn);
625 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
626 "assin a usable interface (old iface '%s')\n",
627 ctdb_addr_to_str(&vnn->public_address),
628 vnn->public_netmask_bits,
633 new_name = ctdb_vnn_iface_string(vnn);
634 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
635 /* A benign update from one interface onto itself.
636 * no need to run the eventscripts in this case, just return
639 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
643 state = talloc(vnn, struct ctdb_do_updateip_state);
644 CTDB_NO_MEMORY(ctdb, state);
646 state->c = talloc_steal(ctdb, c);
650 vnn->update_in_flight = true;
651 talloc_set_destructor(state, ctdb_updateip_destructor);
653 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
654 "interface %s to %s\n",
655 ctdb_addr_to_str(&vnn->public_address),
656 vnn->public_netmask_bits,
660 ret = ctdb_event_script_callback(ctdb,
662 ctdb_do_updateip_callback,
664 CTDB_EVENT_UPDATE_IP,
668 ctdb_addr_to_str(&vnn->public_address),
669 vnn->public_netmask_bits);
671 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
672 ctdb_addr_to_str(&vnn->public_address),
673 old->name, new_name));
682 Find the vnn whose public address matches the given address
683 returns NULL if the address is not known as a public address
685 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
687 struct ctdb_vnn *vnn;
689 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
690 if (ctdb_same_ip(&vnn->public_address, addr)) {
699 take over an ip address
701 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
702 struct ctdb_req_control_old *c,
707 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
708 struct ctdb_vnn *vnn;
709 bool have_ip = false;
710 bool do_updateip = false;
711 bool do_takeip = false;
712 struct ctdb_interface *best_iface = NULL;
714 if (pip->pnn != ctdb->pnn) {
715 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
716 "with pnn %d, but we're node %d\n",
717 ctdb_addr_to_str(&pip->addr),
718 pip->pnn, ctdb->pnn));
722 /* update our vnn list */
723 vnn = find_public_ip_vnn(ctdb, &pip->addr);
725 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
726 ctdb_addr_to_str(&pip->addr)));
730 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
731 have_ip = ctdb_sys_have_ip(&pip->addr);
733 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
734 if (best_iface == NULL) {
735 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
736 "a usable interface (old %s, have_ip %d)\n",
737 ctdb_addr_to_str(&vnn->public_address),
738 vnn->public_netmask_bits,
739 ctdb_vnn_iface_string(vnn),
744 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
745 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
750 if (vnn->iface == NULL && have_ip) {
751 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
753 ctdb_addr_to_str(&vnn->public_address)));
757 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
758 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
759 "and we have it on iface[%s], but it was assigned to node %d"
760 "and we are node %d, banning ourself\n",
761 ctdb_addr_to_str(&vnn->public_address),
762 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
767 if (vnn->pnn == -1 && have_ip) {
768 vnn->pnn = ctdb->pnn;
769 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
770 "and we already have it on iface[%s], update local daemon\n",
771 ctdb_addr_to_str(&vnn->public_address),
772 ctdb_vnn_iface_string(vnn)));
777 if (vnn->iface != best_iface) {
778 if (!vnn->iface->link_up) {
780 } else if (vnn->iface->references > (best_iface->references + 1)) {
781 /* only move when the rebalance gains something */
789 ctdb_vnn_unassign_iface(ctdb, vnn);
796 ret = ctdb_do_takeip(ctdb, c, vnn);
800 } else if (do_updateip) {
801 ret = ctdb_do_updateip(ctdb, c, vnn);
807 * The interface is up and the kernel knows the ip
810 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
811 ctdb_addr_to_str(&pip->addr),
812 vnn->public_netmask_bits,
813 ctdb_vnn_iface_string(vnn)));
817 /* tell ctdb_control.c that we will be replying asynchronously */
824 kill any clients that are registered with an IP that is being released
826 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
828 struct ctdb_client_ip *ip;
830 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
831 ctdb_addr_to_str(addr)));
833 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
834 ctdb_sock_addr tmp_addr;
837 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
839 ctdb_addr_to_str(&ip->addr)));
841 if (ctdb_same_ip(&tmp_addr, addr)) {
842 struct ctdb_client *client = reqid_find(ctdb->idr,
845 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
847 ctdb_addr_to_str(&ip->addr),
850 if (client->pid != 0) {
851 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
852 (unsigned)client->pid,
853 ctdb_addr_to_str(addr),
855 kill(client->pid, SIGKILL);
861 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
863 DLIST_REMOVE(ctdb->vnn, vnn);
864 ctdb_vnn_unassign_iface(ctdb, vnn);
865 ctdb_remove_orphaned_ifaces(ctdb, vnn);
870 called when releaseip event finishes
872 static void release_ip_callback(struct ctdb_context *ctdb, int status,
875 struct takeover_callback_state *state =
876 talloc_get_type(private_data, struct takeover_callback_state);
879 if (status == -ETIME) {
883 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
884 if (ctdb_sys_have_ip(state->addr)) {
886 ("IP %s still hosted during release IP callback, failing\n",
887 ctdb_addr_to_str(state->addr)));
888 ctdb_request_control_reply(ctdb, state->c,
895 /* send a message to all clients of this node telling them
896 that the cluster has been reconfigured and they should
897 release any sockets on this IP */
898 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
899 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
900 data.dsize = strlen((char *)data.dptr)+1;
902 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
904 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
906 /* kill clients that have registered with this IP */
907 release_kill_clients(ctdb, state->addr);
909 ctdb_vnn_unassign_iface(ctdb, state->vnn);
911 /* Process the IP if it has been marked for deletion */
912 if (state->vnn->delete_pending) {
913 do_delete_ip(ctdb, state->vnn);
917 /* the control succeeded */
918 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
922 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
924 if (state->vnn != NULL) {
925 state->vnn->update_in_flight = false;
931 release an ip address
933 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
934 struct ctdb_req_control_old *c,
939 struct takeover_callback_state *state;
940 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
941 struct ctdb_vnn *vnn;
944 /* update our vnn list */
945 vnn = find_public_ip_vnn(ctdb, &pip->addr);
947 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
948 ctdb_addr_to_str(&pip->addr)));
953 /* stop any previous arps */
954 talloc_free(vnn->takeover_ctx);
955 vnn->takeover_ctx = NULL;
957 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
958 * lazy multicast to drop an IP from any node that isn't the
959 * intended new node. The following makes ctdbd ignore
960 * a release for any address it doesn't host.
962 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
963 if (!ctdb_sys_have_ip(&pip->addr)) {
964 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
965 ctdb_addr_to_str(&pip->addr),
966 vnn->public_netmask_bits,
967 ctdb_vnn_iface_string(vnn)));
968 ctdb_vnn_unassign_iface(ctdb, vnn);
972 if (vnn->iface == NULL) {
973 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
974 ctdb_addr_to_str(&pip->addr),
975 vnn->public_netmask_bits));
980 /* There is a potential race between take_ip and us because we
981 * update the VNN via a callback that runs when the
982 * eventscripts have been run. Avoid the race by allowing one
983 * update to be in flight at a time.
985 if (vnn->update_in_flight) {
986 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
987 "update for this IP already in flight\n",
988 ctdb_addr_to_str(&vnn->public_address),
989 vnn->public_netmask_bits));
993 iface = strdup(ctdb_vnn_iface_string(vnn));
995 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
996 ctdb_addr_to_str(&pip->addr),
997 vnn->public_netmask_bits,
1001 state = talloc(ctdb, struct takeover_callback_state);
1002 if (state == NULL) {
1003 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1004 __FILE__, __LINE__);
1009 state->c = talloc_steal(state, c);
1010 state->addr = talloc(state, ctdb_sock_addr);
1011 if (state->addr == NULL) {
1012 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1013 __FILE__, __LINE__);
1018 *state->addr = pip->addr;
1021 vnn->update_in_flight = true;
1022 talloc_set_destructor(state, ctdb_releaseip_destructor);
1024 ret = ctdb_event_script_callback(ctdb,
1025 state, release_ip_callback, state,
1026 CTDB_EVENT_RELEASE_IP,
1029 ctdb_addr_to_str(&pip->addr),
1030 vnn->public_netmask_bits);
1033 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1034 ctdb_addr_to_str(&pip->addr),
1035 ctdb_vnn_iface_string(vnn)));
1040 /* tell the control that we will be replying asynchronously */
1041 *async_reply = true;
1045 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1046 ctdb_sock_addr *addr,
1047 unsigned mask, const char *ifaces,
1050 struct ctdb_vnn *vnn;
1057 tmp = strdup(ifaces);
1058 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1059 if (!ctdb_sys_check_iface_exists(iface)) {
1060 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1067 /* Verify that we don't have an entry for this ip yet */
1068 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1069 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1070 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1071 ctdb_addr_to_str(addr)));
1076 /* create a new vnn structure for this ip address */
1077 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1078 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1079 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1080 tmp = talloc_strdup(vnn, ifaces);
1081 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1082 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1083 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1084 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1085 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1086 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1090 vnn->ifaces[num] = NULL;
1091 vnn->public_address = *addr;
1092 vnn->public_netmask_bits = mask;
1094 if (check_address) {
1095 if (ctdb_sys_have_ip(addr)) {
1096 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1097 vnn->pnn = ctdb->pnn;
1101 for (i=0; vnn->ifaces[i]; i++) {
1102 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1104 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1105 "for public_address[%s]\n",
1106 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1112 DLIST_ADD(ctdb->vnn, vnn);
1118 setup the public address lists from a file
1120 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1126 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1127 if (lines == NULL) {
1128 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1131 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1135 for (i=0;i<nlines;i++) {
1137 ctdb_sock_addr addr;
1138 const char *addrstr;
1143 while ((*line == ' ') || (*line == '\t')) {
1149 if (strcmp(line, "") == 0) {
1152 tok = strtok(line, " \t");
1154 tok = strtok(NULL, " \t");
1156 if (NULL == ctdb->default_public_interface) {
1157 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1162 ifaces = ctdb->default_public_interface;
1167 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1168 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1172 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1173 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1184 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1188 struct ctdb_vnn *svnn;
1189 struct ctdb_interface *cur = NULL;
1193 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1194 CTDB_NO_MEMORY(ctdb, svnn);
1196 svnn->ifaces = talloc_array(svnn, const char *, 2);
1197 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1198 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1199 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1200 svnn->ifaces[1] = NULL;
1202 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1208 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1210 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1211 "for single_ip[%s]\n",
1213 ctdb_addr_to_str(&svnn->public_address)));
1218 /* assume the single public ip interface is initially "good" */
1219 cur = ctdb_find_iface(ctdb, iface);
1221 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1224 cur->link_up = true;
1226 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1232 ctdb->single_ip_vnn = svnn;
1236 struct public_ip_list {
1237 struct public_ip_list *next;
1239 ctdb_sock_addr addr;
1242 /* Given a physical node, return the number of
1243 public addresses that are currently assigned to this node.
1245 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1249 for (;ips;ips=ips->next) {
1250 if (ips->pnn == pnn) {
1258 /* Can the given node host the given IP: is the public IP known to the
1259 * node and is NOIPHOST unset?
1261 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1262 struct ctdb_ipflags ipflags,
1263 struct public_ip_list *ip)
1265 struct ctdb_public_ip_list_old *public_ips;
1268 if (ipflags.noiphost) {
1272 public_ips = ctdb->nodes[pnn]->available_public_ips;
1274 if (public_ips == NULL) {
1278 for (i=0; i<public_ips->num; i++) {
1279 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1280 /* yes, this node can serve this public ip */
1288 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1289 struct ctdb_ipflags ipflags,
1290 struct public_ip_list *ip)
1292 if (ipflags.noiptakeover) {
1296 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1299 /* search the node list for a node to take over this ip.
1300 pick the node that is currently serving the least number of ips
1301 so that the ips get spread out evenly.
1303 static int find_takeover_node(struct ctdb_context *ctdb,
1304 struct ctdb_ipflags *ipflags,
1305 struct public_ip_list *ip,
1306 struct public_ip_list *all_ips)
1308 int pnn, min=0, num;
1311 numnodes = talloc_array_length(ipflags);
1313 for (i=0; i<numnodes; i++) {
1314 /* verify that this node can serve this ip */
1315 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1316 /* no, it couldn't, so skip to the next node */
1320 num = node_ip_coverage(i, all_ips);
1321 /* was this the first node we checked ? */
1333 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1334 ctdb_addr_to_str(&ip->addr)));
1344 static uint32_t *ip_key(ctdb_sock_addr *ip)
1346 static uint32_t key[IP_KEYLEN];
1348 bzero(key, sizeof(key));
1350 switch (ip->sa.sa_family) {
1352 key[3] = htonl(ip->ip.sin_addr.s_addr);
1355 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1356 key[0] = htonl(s6_a32[0]);
1357 key[1] = htonl(s6_a32[1]);
1358 key[2] = htonl(s6_a32[2]);
1359 key[3] = htonl(s6_a32[3]);
1363 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1370 static void *add_ip_callback(void *parm, void *data)
1372 struct public_ip_list *this_ip = parm;
1373 struct public_ip_list *prev_ip = data;
1375 if (prev_ip == NULL) {
1378 if (this_ip->pnn == -1) {
1379 this_ip->pnn = prev_ip->pnn;
1385 static int getips_count_callback(void *param, void *data)
1387 struct public_ip_list **ip_list = (struct public_ip_list **)param;
1388 struct public_ip_list *new_ip = (struct public_ip_list *)data;
1390 new_ip->next = *ip_list;
1395 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1396 struct ctdb_public_ip_list_old *ips,
1399 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1400 struct ctdb_node_map_old *nodemap)
1405 if (ctdb->num_nodes != nodemap->num) {
1406 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1407 ctdb->num_nodes, nodemap->num));
1411 for (j=0; j<nodemap->num; j++) {
1412 /* For readability */
1413 struct ctdb_node *node = ctdb->nodes[j];
1415 /* release any existing data */
1416 TALLOC_FREE(node->known_public_ips);
1417 TALLOC_FREE(node->available_public_ips);
1419 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1423 /* Retrieve the list of known public IPs from the node */
1424 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1429 &node->known_public_ips);
1432 ("Failed to read known public IPs from node: %u\n",
1437 if (ctdb->do_checkpublicip) {
1438 verify_remote_ip_allocation(ctdb,
1439 node->known_public_ips,
1443 /* Retrieve the list of available public IPs from the node */
1444 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1448 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1449 &node->available_public_ips);
1452 ("Failed to read available public IPs from node: %u\n",
1461 static struct public_ip_list *
1462 create_merged_ip_list(struct ctdb_context *ctdb)
1465 struct public_ip_list *ip_list;
1466 struct ctdb_public_ip_list_old *public_ips;
1468 if (ctdb->ip_tree != NULL) {
1469 talloc_free(ctdb->ip_tree);
1470 ctdb->ip_tree = NULL;
1472 ctdb->ip_tree = trbt_create(ctdb, 0);
1474 for (i=0;i<ctdb->num_nodes;i++) {
1475 public_ips = ctdb->nodes[i]->known_public_ips;
1477 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1481 /* there were no public ips for this node */
1482 if (public_ips == NULL) {
1486 for (j=0;j<public_ips->num;j++) {
1487 struct public_ip_list *tmp_ip;
1489 tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1490 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1491 /* Do not use information about IP addresses hosted
1492 * on other nodes, it may not be accurate */
1493 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1494 tmp_ip->pnn = public_ips->ips[j].pnn;
1498 tmp_ip->addr = public_ips->ips[j].addr;
1499 tmp_ip->next = NULL;
1501 trbt_insertarray32_callback(ctdb->ip_tree,
1502 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1509 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1515 * This is the length of the longest common prefix between the IPs.
1516 * It is calculated by XOR-ing the 2 IPs together and counting the
1517 * number of leading zeroes. The implementation means that all
1518 * addresses end up being 128 bits long.
1520 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1521 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1522 * lots of nodes and IP addresses?
1524 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1526 uint32_t ip1_k[IP_KEYLEN];
1531 uint32_t distance = 0;
1533 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1535 for (i=0; i<IP_KEYLEN; i++) {
1536 x = ip1_k[i] ^ t[i];
1540 /* Count number of leading zeroes.
1541 * FIXME? This could be optimised...
1543 while ((x & (1 << 31)) == 0) {
1553 /* Calculate the IP distance for the given IP relative to IPs on the
1554 given node. The ips argument is generally the all_ips variable
1555 used in the main part of the algorithm.
/* Accumulate the squared ip_distance() between ip and the addresses in
 * ips, testing each entry's pnn against the requested node and
 * treating the entry whose addr is ip itself (pointer identity)
 * specially.
 *
 * NOTE(review): the bodies of the two filtering branches and the
 * return statement are not visible in this listing - presumably the
 * filtered entries are skipped and sum is returned.
 */
1557 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1558 struct public_ip_list *ips,
1561 struct public_ip_list *t;
1566 for (t=ips; t != NULL; t=t->next) {
1567 if (t->pnn != pnn) {
1571 /* Optimisation: We never calculate the distance
1572 * between an address and itself. This allows us to
1573 * calculate the effect of removing an address from a
1574 * node by simply calculating the distance between
1575 * that address and all of the existing addresses.
1576 * Moreover, we assume that we're only ever dealing
1577 * with addresses from all_ips so we can identify an
1578 * address via a pointer rather than doing a more
1579 * expensive address comparison. */
1580 if (&(t->addr) == ip) {
1584 d = ip_distance(ip, &(t->addr));
1585 sum += d * d; /* Cheaper than pulling in math.h :-) */
1591 /* Return the LCP2 imbalance metric for addresses currently assigned
/* Walk all_ips and, for each entry selected by the pnn test, add
 * ip_distance_2_sum() of that address against the remainder of the
 * list (t->next).  Passing only the tail means each pair of addresses
 * is counted once.
 *
 * NOTE(review): the body of the "t->pnn != pnn" branch and the return
 * are not visible here - presumably non-matching entries are skipped
 * and imbalance is returned.
 */
1594 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1596 struct public_ip_list *t;
1598 uint32_t imbalance = 0;
1600 for (t=all_ips; t!=NULL; t=t->next) {
1601 if (t->pnn != pnn) {
1604 /* Pass the rest of the IPs rather than the whole
1607 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1613 /* Allocate any unassigned IPs just by looping through the IPs and
1614 * finding the best node for each.
/* For every address in all_ips whose pnn is -1 (unassigned), ask
 * find_takeover_node() to pick a node.  A non-zero result is logged as
 * a warning naming the address.
 */
1616 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1617 struct ctdb_ipflags *ipflags,
1618 struct public_ip_list *all_ips)
1620 struct public_ip_list *tmp_ip;
1622 /* loop over all ip's and find a physical node to cover for
1625 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1626 if (tmp_ip->pnn == -1) {
1627 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1628 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1629 ctdb_addr_to_str(&tmp_ip->addr)));
1635 /* Basic non-deterministic rebalancing algorithm.
/* Rebalance assigned addresses between nodes.
 *
 * For each assigned address, the nodes that pass
 * can_node_takeover_ip() are scanned and node_ip_coverage() is used to
 * find the nodes serving the most (maxnode/maxnum) and the fewest
 * (minnode/minnum) addresses.  When the spread exceeds one and fewer
 * than num_ips + 5 retries have been made, an address currently on
 * maxnode is handed back to find_takeover_node() for reassignment.
 *
 * The number of nodes is taken from the length of the ipflags array.
 */
1637 static void basic_failback(struct ctdb_context *ctdb,
1638 struct ctdb_ipflags *ipflags,
1639 struct public_ip_list *all_ips,
1643 int maxnode, maxnum, minnode, minnum, num, retries;
1644 struct public_ip_list *tmp_ip;
1646 numnodes = talloc_array_length(ipflags);
1653 /* for each ip address, loop over all nodes that can serve
1654 this ip and make sure that the difference between the node
1655 serving the most and the node serving the least ip's are
1658 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1659 if (tmp_ip->pnn == -1) {
1663 /* Get the highest and lowest number of ips's served by any
1664 valid node which can serve this ip.
1668 for (i=0; i<numnodes; i++) {
1669 /* only check nodes that can actually serve this ip */
1670 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1671 /* no it couldnt so skip to the next node */
1675 num = node_ip_coverage(i, all_ips);
1676 if (maxnode == -1) {
1685 if (minnode == -1) {
1695 if (maxnode == -1) {
1696 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1697 ctdb_addr_to_str(&tmp_ip->addr)));
1702 /* if the spread between the smallest and largest coverage by
1703 a node is >=2 we steal one of the ips from the node with
1704 most coverage to even things out a bit.
1705 try to do this a limited number of times since we dont
1706 want to spend too much time balancing the ip coverage.
1708 if ( (maxnum > minnum+1)
1709 && (retries < (num_ips + 5)) ){
1710 struct public_ip_list *tmp;
1712 /* Reassign one of maxnode's VNNs */
1713 for (tmp=all_ips;tmp;tmp=tmp->next) {
1714 if (tmp->pnn == maxnode) {
1715 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
/* Allocate and initialise the per-node working arrays used by the LCP2
 * algorithm.
 *
 * Both arrays are allocated on tmp_ctx with one element per entry of
 * ipflags; allocation failure is fatal.
 *   *lcp2_imbalances      - lcp2_imbalance() of each node.
 *   *rebalance_candidates - starts as true for every node, is cleared
 *                           for any node that currently has an address
 *                           assigned, and is set again for each valid
 *                           PNN in force_rebalance_nodes.
 * PNNs in force_rebalance_nodes that are >= the node count are
 * reported rather than used as an index.
 */
1724 static void lcp2_init(struct ctdb_context *tmp_ctx,
1725 struct ctdb_ipflags *ipflags,
1726 struct public_ip_list *all_ips,
1727 uint32_t *force_rebalance_nodes,
1728 uint32_t **lcp2_imbalances,
1729 bool **rebalance_candidates)
1732 struct public_ip_list *tmp_ip;
1734 numnodes = talloc_array_length(ipflags);
1736 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1737 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1738 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1739 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1741 for (i=0; i<numnodes; i++) {
1742 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1743 /* First step: assume all nodes are candidates */
1744 (*rebalance_candidates)[i] = true;
1747 /* 2nd step: if a node has IPs assigned then it must have been
1748 * healthy before, so we remove it from consideration. This
1749 * is overkill but is all we have because we don't maintain
1750 * state between takeover runs. An alternative would be to
1751 * keep state and invalidate it every time the recovery master
1754 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1755 if (tmp_ip->pnn != -1) {
1756 (*rebalance_candidates)[tmp_ip->pnn] = false;
1760 /* 3rd step: if a node is forced to re-balance then
1761 we allow failback onto the node */
1762 if (force_rebalance_nodes == NULL) {
1765 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1766 uint32_t pnn = force_rebalance_nodes[i];
1767 if (pnn >= numnodes) {
1769 (__location__ "unknown node %u\n", pnn));
1774 ("Forcing rebalancing of IPs to node %u\n", pnn));
1775 (*rebalance_candidates)[pnn] = true;
1779 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1780 * the IP/node combination that will cost the least.
/* Assign unassigned addresses (pnn == -1) one at a time.
 *
 * Each pass considers every unassigned address against every node
 * that passes can_node_takeover_ip(), computing the extra cost
 * (ip_distance_2_sum) of placing the address there.  The candidate
 * with the smallest cost is remembered; if one was found it is
 * assigned and lcp2_imbalances[] for that node is updated.  Passes
 * repeat while addresses remain unassigned and should_loop is set.
 * Anything still unassigned at the end is logged as a warning.
 */
1782 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1783 struct ctdb_ipflags *ipflags,
1784 struct public_ip_list *all_ips,
1785 uint32_t *lcp2_imbalances)
1787 struct public_ip_list *tmp_ip;
1788 int dstnode, numnodes;
1791 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1792 struct public_ip_list *minip;
1794 bool should_loop = true;
1795 bool have_unassigned = true;
1797 numnodes = talloc_array_length(ipflags);
1799 while (have_unassigned && should_loop) {
1800 should_loop = false;
1802 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1803 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1809 /* loop over each unassigned ip. */
1810 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1811 if (tmp_ip->pnn != -1) {
1815 for (dstnode=0; dstnode<numnodes; dstnode++) {
1816 /* only check nodes that can actually takeover this ip */
1817 if (!can_node_takeover_ip(ctdb, dstnode,
1820 /* no it couldnt so skip to the next node */
1824 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1825 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1826 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1827 ctdb_addr_to_str(&(tmp_ip->addr)),
1829 dstimbl - lcp2_imbalances[dstnode]));
1832 if ((minnode == -1) || (dstdsum < mindsum)) {
1842 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1844 /* If we found one then assign it to the given node. */
1845 if (minnode != -1) {
1846 minip->pnn = minnode;
1847 lcp2_imbalances[minnode] = minimbl;
1848 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1849 ctdb_addr_to_str(&(minip->addr)),
1854 /* There might be a better way but at least this is clear. */
1855 have_unassigned = false;
1856 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1857 if (tmp_ip->pnn == -1) {
1858 have_unassigned = true;
1863 /* We know if we have an unassigned addresses so we might as
1866 if (have_unassigned) {
1867 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1868 if (tmp_ip->pnn == -1) {
1869 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1870 ctdb_addr_to_str(&tmp_ip->addr)));
1876 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1877 * to move IPs from, determines the best IP/destination node
1878 * combination to move from the source node.
/* Try to move one address away from srcnode.
 *
 * For every address on srcnode, srcdsum is what that address costs the
 * source node and srcimbl the source imbalance without it.  Each
 * destination flagged in rebalance_candidates[] that passes
 * can_node_takeover_ip() is then costed (dstdsum/dstimbl).  A move is
 * acceptable only if the destination ends up below the source's
 * current imbalance and the address costs less there; among acceptable
 * moves the one with the smallest combined imbalance wins.
 *
 * When a move is found, lcp2_imbalances[] is updated for both nodes
 * and the address is reassigned to the destination.
 *
 * NOTE(review): the return statements are not visible in this
 * listing - presumably true is returned when a move was made.
 */
1880 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1881 struct ctdb_ipflags *ipflags,
1882 struct public_ip_list *all_ips,
1884 uint32_t *lcp2_imbalances,
1885 bool *rebalance_candidates)
1887 int dstnode, mindstnode, numnodes;
1888 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1889 uint32_t minsrcimbl, mindstimbl;
1890 struct public_ip_list *minip;
1891 struct public_ip_list *tmp_ip;
1893 /* Find an IP and destination node that best reduces imbalance. */
1900 numnodes = talloc_array_length(ipflags);
1902 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1903 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1904 srcnode, lcp2_imbalances[srcnode]));
1906 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1907 /* Only consider addresses on srcnode. */
1908 if (tmp_ip->pnn != srcnode) {
1912 /* What is this IP address costing the source node? */
1913 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1914 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1916 /* Consider this IP address would cost each potential
1917 * destination node. Destination nodes are limited to
1918 * those that are newly healthy, since we don't want
1919 * to do gratuitous failover of IPs just to make minor
1920 * balance improvements.
1922 for (dstnode=0; dstnode<numnodes; dstnode++) {
1923 if (!rebalance_candidates[dstnode]) {
1927 /* only check nodes that can actually takeover this ip */
1928 if (!can_node_takeover_ip(ctdb, dstnode,
1929 ipflags[dstnode], tmp_ip)) {
1930 /* no it couldnt so skip to the next node */
1934 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1935 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1936 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1938 ctdb_addr_to_str(&(tmp_ip->addr)),
1941 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1942 (dstdsum < srcdsum) && \
1943 ((mindstnode == -1) || \
1944 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1947 minsrcimbl = srcimbl;
1948 mindstnode = dstnode;
1949 mindstimbl = dstimbl;
1953 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1955 if (mindstnode != -1) {
1956 /* We found a move that makes things better... */
1957 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1958 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1959 ctdb_addr_to_str(&(minip->addr)),
1960 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1963 lcp2_imbalances[srcnode] = minsrcimbl;
1964 lcp2_imbalances[mindstnode] = mindstimbl;
1965 minip->pnn = mindstnode;
/* Element type of the array that lcp2_failback() sorts with qsort();
 * it carries a node's imbalance value.
 * NOTE(review): the member declarations are not visible in this
 * listing - presumably an imbalance and the node's PNN.
 */
1974 struct lcp2_imbalance_pnn {
/* qsort() comparison function for arrays of struct lcp2_imbalance_pnn.
 * Ordering is decided by the imbalance member of the two elements.
 * NOTE(review): the returned values are not visible in this listing,
 * so the sort direction cannot be confirmed from here.
 */
1979 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
1981 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
1982 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
1984 if (lipa->imbalance > lipb->imbalance) {
1986 } else if (lipa->imbalance == lipb->imbalance) {
1993 /* LCP2 algorithm for rebalancing the cluster. This finds the source
1994 * node with the highest LCP2 imbalance, and then determines the best
1995 * IP/destination node combination to move from the source node.
1997 static void lcp2_failback(struct ctdb_context *ctdb,
1998 struct ctdb_ipflags *ipflags,
1999 struct public_ip_list *all_ips,
2000 uint32_t *lcp2_imbalances,
2001 bool *rebalance_candidates)
2004 struct lcp2_imbalance_pnn * lips;
2007 numnodes = talloc_array_length(ipflags);
2010 /* Put the imbalances and nodes into an array, sort them and
2011 * iterate through candidates. Usually the 1st one will be
2012 * used, so this doesn't cost much...
2014 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2015 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2016 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2017 for (i=0; i<numnodes; i++) {
2018 lips[i].imbalance = lcp2_imbalances[i];
2020 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2022 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2023 lcp2_cmp_imbalance_pnn);
2026 for (i=0; i<numnodes; i++) {
2027 /* This means that all nodes had 0 or 1 addresses, so
2028 * can't be imbalanced.
2030 if (lips[i].imbalance == 0) {
2034 if (lcp2_failback_candidate(ctdb,
2039 rebalance_candidates)) {
2051 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2052 struct ctdb_ipflags *ipflags,
2053 struct public_ip_list *all_ips)
2055 struct public_ip_list *tmp_ip;
2057 /* verify that the assigned nodes can serve that public ip
2058 and set it to -1 if not
2060 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2061 if (tmp_ip->pnn == -1) {
2064 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2065 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2066 /* this node can not serve this ip. */
2067 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2068 ctdb_addr_to_str(&(tmp_ip->addr)),
/* Deterministic allocation: assign the i-th address in all_ips to node
 * (i % numnodes), where numnodes is the length of ipflags.  Unsuitable
 * assignments are then removed with unassign_unsuitable_ips() and the
 * leftovers are placed by basic_allocate_unassigned().  No failback is
 * performed; a warning is logged if the no_ip_failback tunable is 1.
 */
2075 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2076 struct ctdb_ipflags *ipflags,
2077 struct public_ip_list *all_ips)
2079 struct public_ip_list *tmp_ip;
2082 numnodes = talloc_array_length(ipflags);
2084 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2085 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2086 * always be allocated the same way for a specific set of
2087 * available/unavailable nodes.
2090 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2091 tmp_ip->pnn = i % numnodes;
2094 /* IP failback doesn't make sense with deterministic
2095 * IPs, since the modulo step above implicitly fails
2096 * back IPs to their "home" node.
2098 if (1 == ctdb->tunable.no_ip_failback) {
2099 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2102 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2104 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2106 /* No failback here! */
/* Non-deterministic allocation: remove unsuitable assignments, place
 * unassigned addresses with basic_allocate_unassigned() and then call
 * basic_failback() to even out the distribution.  The no_ip_failback
 * tunable is tested before the failback step.
 *
 * NOTE(review): the body of the first loop is not visible in this
 * listing - presumably it counts the addresses to produce num_ips.
 */
2109 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2110 struct ctdb_ipflags *ipflags,
2111 struct public_ip_list *all_ips)
2113 /* This should be pushed down into basic_failback. */
2114 struct public_ip_list *tmp_ip;
2116 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2120 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2122 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2124 /* If we don't want IPs to fail back then don't rebalance IPs. */
2125 if (1 == ctdb->tunable.no_ip_failback) {
2129 /* Now, try to make sure the ip adresses are evenly distributed
2132 basic_failback(ctdb, ipflags, all_ips, num_ips);
/* LCP2 allocation driver.
 *
 * Working arrays live on a temporary talloc context that is freed at
 * the end.  Steps: remove unsuitable assignments, initialise the
 * imbalance and candidate arrays with lcp2_init(), place unassigned
 * addresses with lcp2_allocate_unassigned(), then - after testing the
 * no_ip_failback tunable and counting the rebalance candidates - run
 * lcp2_failback().
 */
2135 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2136 struct ctdb_ipflags *ipflags,
2137 struct public_ip_list *all_ips,
2138 uint32_t *force_rebalance_nodes)
2140 uint32_t *lcp2_imbalances;
2141 bool *rebalance_candidates;
2142 int numnodes, num_rebalance_candidates, i;
2144 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2146 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2148 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2149 &lcp2_imbalances, &rebalance_candidates);
2151 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2153 /* If we don't want IPs to fail back then don't rebalance IPs. */
2154 if (1 == ctdb->tunable.no_ip_failback) {
2158 /* It is only worth continuing if we have suitable target
2159 * nodes to transfer IPs to. This check is much cheaper than
2162 numnodes = talloc_array_length(ipflags);
2163 num_rebalance_candidates = 0;
2164 for (i=0; i<numnodes; i++) {
2165 if (rebalance_candidates[i]) {
2166 num_rebalance_candidates++;
2169 if (num_rebalance_candidates == 0) {
2173 /* Now, try to make sure the ip adresses are evenly distributed
2176 lcp2_failback(ctdb, ipflags, all_ips,
2177 lcp2_imbalances, rebalance_candidates);
2180 talloc_free(tmp_ctx);
/* Scan the node map for a node that has neither NODE_FLAGS_INACTIVE
 * nor NODE_FLAGS_DISABLED set.
 * NOTE(review): the return statements are not visible in this
 * listing - presumably false is returned when such a node exists.
 */
2183 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2187 for (i=0;i<nodemap->num;i++) {
2188 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2189 /* Found one completely healthy node */
2197 /* The calculation part of the IP allocation algorithm. */
/* Build the merged address list into *all_ips_p and run one of the
 * three allocation algorithms on it.  The choice depends on tunables:
 * lcp2_public_ip_assignment == 1 selects LCP2, otherwise
 * deterministic_public_ips == 1 selects the deterministic algorithm,
 * otherwise the non-deterministic one is used.  force_rebalance_nodes
 * is only passed to the LCP2 algorithm.
 */
2198 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2199 struct ctdb_ipflags *ipflags,
2200 struct public_ip_list **all_ips_p,
2201 uint32_t *force_rebalance_nodes)
2203 /* since nodes only know about those public addresses that
2204 can be served by that particular node, no single node has
2205 a full list of all public addresses that exist in the cluster.
2206 Walk over all node structures and create a merged list of
2207 all public addresses that exist in the cluster.
2209 keep the tree of ips around as ctdb->ip_tree
2211 *all_ips_p = create_merged_ip_list(ctdb);
2213 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2214 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2215 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2216 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2218 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2221 /* at this point ->pnn is the node which will own each IP
2222 or -1 if there is no node that can cover this ip
/* State shared between get_tunable_from_nodes() and its async
 * callbacks; tunable is the name of the tunable being fetched.
 * NOTE(review): the remaining members are not visible in this
 * listing - the callers also use "out" and "fatal".
 */
2228 struct get_tunable_callback_data {
2229 const char *tunable;
/* Per-node success callback for the GET_TUNABLE control.
 *
 * The reply must be exactly sizeof(uint32_t) bytes and the replying
 * PNN is checked against the length of cd->out; both problems are
 * logged.  A valid value is stored in cd->out[pnn].
 */
2234 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2235 int32_t res, TDB_DATA outdata,
2238 struct get_tunable_callback_data *cd =
2239 (struct get_tunable_callback_data *)callback;
2243 /* Already handled in fail callback */
2247 if (outdata.dsize != sizeof(uint32_t)) {
2248 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2249 cd->tunable, pnn, (int)sizeof(uint32_t),
2250 (int)outdata.dsize));
2255 size = talloc_array_length(cd->out);
2257 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2258 cd->tunable, pnn, size));
2263 cd->out[pnn] = *(uint32_t *)outdata.dptr;
/* Per-node failure callback for the GET_TUNABLE control.  It logs one
 * of three messages: the request timed out, the tunable is not
 * implemented on the node, or an unexpected error occurred.
 * NOTE(review): the conditions selecting each message are not visible
 * in this listing.
 */
2266 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2267 int32_t res, TDB_DATA outdata,
2270 struct get_tunable_callback_data *cd =
2271 (struct get_tunable_callback_data *)callback;
2276 ("Timed out getting tunable \"%s\" from node %d\n",
2282 DEBUG(DEBUG_WARNING,
2283 ("Tunable \"%s\" not implemented on node %d\n",
2288 ("Unexpected error getting tunable \"%s\" from node %d\n",
/* Fetch the value of one tunable from every connected node.
 *
 * An array with nodemap->num entries is allocated on tmp_ctx and
 * pre-filled with default_value.  A GET_TUNABLE request carrying the
 * tunable name (including its terminating NUL) is sent to the nodes
 * returned by list_of_connected_nodes(); the callbacks store each
 * reply in the array.  The request buffer is freed afterwards.
 *
 * NOTE(review): the handling of callback_data.fatal and the return
 * statement are not visible in this listing - presumably tvals is
 * returned, or NULL on a fatal failure.
 */
2294 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2295 TALLOC_CTX *tmp_ctx,
2296 struct ctdb_node_map_old *nodemap,
2297 const char *tunable,
2298 uint32_t default_value)
2301 struct ctdb_control_get_tunable *t;
2304 struct get_tunable_callback_data callback_data;
2307 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2308 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2309 for (i=0; i<nodemap->num; i++) {
2310 tvals[i] = default_value;
2313 callback_data.out = tvals;
2314 callback_data.tunable = tunable;
2315 callback_data.fatal = false;
2317 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2318 data.dptr = talloc_size(tmp_ctx, data.dsize);
2319 t = (struct ctdb_control_get_tunable *)data.dptr;
2320 t->length = strlen(tunable)+1;
2321 memcpy(t->name, tunable, t->length);
2322 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2323 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2324 nodes, 0, TAKEOVER_TIMEOUT(),
2326 get_tunable_callback,
2327 get_tunable_fail_callback,
2328 &callback_data) != 0) {
2329 if (callback_data.fatal) {
2335 talloc_free(data.dptr);
2340 /* Set internal flags for IP allocation:
2342 * Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2343 * Set NOIPHOST ip flag for each INACTIVE node
2344 * if all nodes are disabled:
2345 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2347 * Set NOIPHOST ip flags for disabled nodes
/* Compute the per-node allocation flags from the node map and the two
 * per-node tunable arrays.
 *
 * The result is a zero-initialised array with nodemap->num entries
 * allocated on tmp_ctx.
 *   noiptakeover - set where tval_noiptakeover[i] is non-zero.
 *   noiphost     - set for INACTIVE nodes; additionally, if
 *                  all_nodes_are_disabled() holds, set where
 *                  tval_noiphostonalldisabled[i] is non-zero,
 *                  otherwise set for DISABLED nodes.
 */
2349 static struct ctdb_ipflags *
2350 set_ipflags_internal(struct ctdb_context *ctdb,
2351 TALLOC_CTX *tmp_ctx,
2352 struct ctdb_node_map_old *nodemap,
2353 uint32_t *tval_noiptakeover,
2354 uint32_t *tval_noiphostonalldisabled)
2357 struct ctdb_ipflags *ipflags;
2359 /* Clear IP flags - implicit due to talloc_zero */
2360 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2361 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2363 for (i=0;i<nodemap->num;i++) {
2364 /* Can not take IPs on node with NoIPTakeover set */
2365 if (tval_noiptakeover[i] != 0) {
2366 ipflags[i].noiptakeover = true;
2369 /* Can not host IPs on INACTIVE node */
2370 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2371 ipflags[i].noiphost = true;
2375 if (all_nodes_are_disabled(nodemap)) {
2376 /* If all nodes are disabled, can not host IPs on node
2377 * with NoIPHostOnAllDisabled set
2379 for (i=0;i<nodemap->num;i++) {
2380 if (tval_noiphostonalldisabled[i] != 0) {
2381 ipflags[i].noiphost = true;
2385 /* If some nodes are not disabled, then can not host
2386 * IPs on DISABLED node
2388 for (i=0;i<nodemap->num;i++) {
2389 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2390 ipflags[i].noiphost = true;
/* Fetch two per-node tunables from the cluster (the second is
 * "NoIPHostOnAllDisabled", default 0) and turn them into per-node
 * allocation flags with set_ipflags_internal().  The temporary tunable
 * arrays are freed before returning; everything is allocated on
 * tmp_ctx, which the caller frees.
 *
 * NOTE(review): the failure returns are not visible in this listing -
 * presumably NULL is returned when a tunable cannot be fetched.
 */
2398 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2399 TALLOC_CTX *tmp_ctx,
2400 struct ctdb_node_map_old *nodemap)
2402 uint32_t *tval_noiptakeover;
2403 uint32_t *tval_noiphostonalldisabled;
2404 struct ctdb_ipflags *ipflags;
2407 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2409 if (tval_noiptakeover == NULL) {
2413 tval_noiphostonalldisabled =
2414 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2415 "NoIPHostOnAllDisabled", 0);
2416 if (tval_noiphostonalldisabled == NULL) {
2417 /* Caller frees tmp_ctx */
2421 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2423 tval_noiphostonalldisabled);
2425 talloc_free(tval_noiptakeover);
2426 talloc_free(tval_noiphostonalldisabled);
/* State passed to iprealloc_fail_callback(): the caller's real failure
 * callback with its private data, and the node map used to look up
 * node flags.
 * NOTE(review): further members are not visible in this listing - the
 * users also reference "retry_nodes" and "retry_count".
 */
2431 struct iprealloc_callback_data {
2434 client_async_callback fail_callback;
2435 void *fail_callback_data;
2436 struct ctdb_node_map_old *nodemap;
2439 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2440 int32_t res, TDB_DATA outdata,
2444 struct iprealloc_callback_data *cd =
2445 (struct iprealloc_callback_data *)callback;
2447 numnodes = talloc_array_length(cd->retry_nodes);
2448 if (pnn > numnodes) {
2450 ("ipreallocated failure from node %d, "
2451 "but only %d nodes in nodemap\n",
2456 /* Can't run the "ipreallocated" event on a INACTIVE node */
2457 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2458 DEBUG(DEBUG_WARNING,
2459 ("ipreallocated failed on inactive node %d, ignoring\n",
2466 /* If the control timed out then that's a real error,
2467 * so call the real fail callback
2469 if (cd->fail_callback) {
2470 cd->fail_callback(ctdb, pnn, res, outdata,
2471 cd->fail_callback_data);
2473 DEBUG(DEBUG_WARNING,
2474 ("iprealloc timed out but no callback registered\n"));
2478 /* If not a timeout then either the ipreallocated
2479 * eventscript (or some setup) failed. This might
2480 * have failed because the IPREALLOCATED control isn't
2481 * implemented - right now there is no way of knowing
2482 * because the error codes are all folded down to -1.
2483 * Consider retrying using EVENTSCRIPT control...
2485 DEBUG(DEBUG_WARNING,
2486 ("ipreallocated failure from node %d, flagging retry\n",
2488 cd->retry_nodes[pnn] = true;
/* State passed to takeover_run_fail_callback(): the caller's failure
 * callback with its private data, and the node map used to translate
 * a PNN into an index.
 * NOTE(review): further members are not visible in this listing - the
 * users also reference a per-node "node_failed" array.
 */
2493 struct takeover_callback_data {
2495 client_async_callback fail_callback;
2496 void *fail_callback_data;
2497 struct ctdb_node_map_old *nodemap;
/* Failure callback wrapper used while sending RELEASE_IP controls.
 *
 * node_pnn is looked up in cd->nodemap to find its index; an unknown
 * PNN is logged as an error.  The caller's fail callback is invoked
 * only the first time a given node fails, which is tracked in
 * cd->node_failed[].
 */
2500 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2501 uint32_t node_pnn, int32_t res,
2502 TDB_DATA outdata, void *callback_data)
2504 struct takeover_callback_data *cd =
2505 talloc_get_type_abort(callback_data,
2506 struct takeover_callback_data);
2509 for (i = 0; i < cd->nodemap->num; i++) {
2510 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2515 if (i == cd->nodemap->num) {
2516 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2520 if (!cd->node_failed[i]) {
2521 cd->node_failed[i] = true;
2522 cd->fail_callback(ctdb, node_pnn, res, outdata,
2523 cd->fail_callback_data);
2528 make any IP alias changes for public addresses that are necessary
/* Run a complete takeover pass.
 *
 * nodemap               - current node map
 * force_rebalance_nodes - PNNs to force as rebalance targets,
 *                         passed on to the allocation step
 * fail_callback         - invoked for nodes whose controls fail
 * callback_data         - private data for fail_callback
 *
 * Visible phases, all using a temporary talloc context:
 *  1. Test the disable_ip_failover tunable.
 *  2. Compute per-node flags (set_ipflags) and reload each node's
 *     public IP lists; either failing frees the context.
 *  3. Check whether any node has available_public_ips.
 *  4. Calculate the new assignment (ctdb_takeover_run_core).
 *  5. Send RELEASE_IP for each address to every connected node that
 *     should not host it and wait for completion.
 *  6. Send TAKEOVER_IP for each assigned address to its node and wait.
 *  7. Send IPREALLOCATED to the connected nodes; nodes flagged for
 *     retry by iprealloc_fail_callback are retried with
 *     RUN_EVENTSCRIPTS and the "ipreallocated" argument.
 *
 * NOTE(review): return statements and several branch bodies are not
 * visible in this listing, so the return values are not documented.
 */
2530 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2531 uint32_t *force_rebalance_nodes,
2532 client_async_callback fail_callback, void *callback_data)
2535 struct ctdb_public_ip ip;
2537 struct public_ip_list *all_ips, *tmp_ip;
2539 struct timeval timeout;
2540 struct client_async_data *async_data;
2541 struct ctdb_client_control_state *state;
2542 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2543 struct ctdb_ipflags *ipflags;
2544 struct takeover_callback_data *takeover_data;
2545 struct iprealloc_callback_data iprealloc_data;
2550 * ip failover is completely disabled, just send out the
2551 * ipreallocated event.
2553 if (ctdb->tunable.disable_ip_failover != 0) {
2557 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2558 if (ipflags == NULL) {
2559 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2560 talloc_free(tmp_ctx);
2564 /* Fetch known/available public IPs from each active node */
2565 ret = ctdb_reload_remote_public_ips(ctdb, nodemap);
2567 talloc_free(tmp_ctx);
2571 /* Short-circuit IP allocation if no node has available IPs */
2572 can_host_ips = false;
2573 for (i=0; i < ctdb->num_nodes; i++) {
2574 if (ctdb->nodes[i]->available_public_ips != NULL) {
2575 can_host_ips = true;
2578 if (!can_host_ips) {
2579 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2583 /* Do the IP reassignment calculations */
2584 ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2586 /* Now tell all nodes to release any public IPs should not
2587 * host. This will be a NOOP on nodes that don't currently
2588 * hold the given IP.
2590 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2591 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2593 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2594 bool, nodemap->num);
2595 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2596 takeover_data->fail_callback = fail_callback;
2597 takeover_data->fail_callback_data = callback_data;
2598 takeover_data->nodemap = nodemap;
2600 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2601 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2603 async_data->fail_callback = takeover_run_fail_callback;
2604 async_data->callback_data = takeover_data;
2606 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2608 /* Send a RELEASE_IP to all nodes that should not be hosting
2609 * each IP. For each IP, all but one of these will be
2610 * redundant. However, the redundant ones are used to tell
2611 * nodes which node should be hosting the IP so that commands
2612 * like "ctdb ip" can display a particular nodes idea of who
2613 * is hosting what. */
2614 for (i=0;i<nodemap->num;i++) {
2615 /* don't talk to unconnected nodes, but do talk to banned nodes */
2616 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2620 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2621 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2622 /* This node should be serving this
2623 vnn so don't tell it to release the ip
2627 ip.pnn = tmp_ip->pnn;
2628 ip.addr = tmp_ip->addr;
2630 timeout = TAKEOVER_TIMEOUT();
2631 data.dsize = sizeof(ip);
2632 data.dptr = (uint8_t *)&ip;
2633 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2634 0, CTDB_CONTROL_RELEASE_IP, 0,
2637 if (state == NULL) {
2638 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2639 talloc_free(tmp_ctx);
2643 ctdb_client_async_add(async_data, state);
2646 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2647 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2648 talloc_free(tmp_ctx);
2651 talloc_free(async_data);
2654 /* For each IP, send a TAKEOVER_IP to the node that should be
2655 * hosting it. Many of these will often be redundant (since
2656 * the allocation won't have changed) but they can be useful
2657 * to recover from inconsistencies. */
2658 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2659 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2661 async_data->fail_callback = fail_callback;
2662 async_data->callback_data = callback_data;
2664 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2665 if (tmp_ip->pnn == -1) {
2666 /* this IP won't be taken over */
2670 ip.pnn = tmp_ip->pnn;
2671 ip.addr = tmp_ip->addr;
2673 timeout = TAKEOVER_TIMEOUT();
2674 data.dsize = sizeof(ip);
2675 data.dptr = (uint8_t *)&ip;
2676 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2677 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2678 data, async_data, &timeout, NULL);
2679 if (state == NULL) {
2680 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2681 talloc_free(tmp_ctx);
2685 ctdb_client_async_add(async_data, state);
2687 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2688 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2689 talloc_free(tmp_ctx);
2695 * Tell all nodes to run eventscripts to process the
2696 * "ipreallocated" event. This can do a lot of things,
2697 * including restarting services to reconfigure them if public
2698 * IPs have moved. Once upon a time this event only used to
2701 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2702 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2703 iprealloc_data.retry_nodes = retry_data;
2704 iprealloc_data.retry_count = 0;
2705 iprealloc_data.fail_callback = fail_callback;
2706 iprealloc_data.fail_callback_data = callback_data;
2707 iprealloc_data.nodemap = nodemap;
2709 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2710 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2711 nodes, 0, TAKEOVER_TIMEOUT(),
2713 NULL, iprealloc_fail_callback,
2716 /* If the control failed then we should retry to any
2717 * nodes flagged by iprealloc_fail_callback using the
2718 * EVENTSCRIPT control. This is a best-effort at
2719 * backward compatiblity when running a mixed cluster
2720 * where some nodes have not yet been upgraded to
2721 * support the IPREALLOCATED control.
2723 DEBUG(DEBUG_WARNING,
2724 ("Retry ipreallocated to some nodes using eventscript control\n"));
2726 nodes = talloc_array(tmp_ctx, uint32_t,
2727 iprealloc_data.retry_count);
2728 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2731 for (i=0; i<nodemap->num; i++) {
2732 if (iprealloc_data.retry_nodes[i]) {
2738 data.dptr = discard_const("ipreallocated");
2739 data.dsize = strlen((char *)data.dptr) + 1;
2740 ret = ctdb_client_async_control(ctdb,
2741 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2742 nodes, 0, TAKEOVER_TIMEOUT(),
2744 NULL, fail_callback,
2747 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2751 talloc_free(tmp_ctx);
2757 destroy a ctdb_client_ip structure
/* Destructor for a struct ctdb_client_ip (installed with
 * talloc_set_destructor): logs the address and port being dropped and
 * unlinks the entry from ctdb->client_ip_list.
 */
2759 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2761 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2762 ctdb_addr_to_str(&ip->addr),
2763 ntohs(ip->addr.ip.sin_port),
2766 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2771 called by a client to inform us of a TCP connection that it is managing
2772 that should be tickled with an ACK when IP takeover is done
2774 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2777 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2778 struct ctdb_connection *tcp_sock = NULL;
2779 struct ctdb_tcp_list *tcp;
2780 struct ctdb_connection t;
2783 struct ctdb_client_ip *ip;
2784 struct ctdb_vnn *vnn;
2785 ctdb_sock_addr addr;
2787 /* If we don't have public IPs, tickles are useless */
2788 if (ctdb->vnn == NULL) {
2792 tcp_sock = (struct ctdb_connection *)indata.dptr;
2794 addr = tcp_sock->src;
2795 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2796 addr = tcp_sock->dst;
2797 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2800 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2801 vnn = find_public_ip_vnn(ctdb, &addr);
2803 switch (addr.sa.sa_family) {
2805 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2806 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2807 ctdb_addr_to_str(&addr)));
2811 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2812 ctdb_addr_to_str(&addr)));
2815 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2821 if (vnn->pnn != ctdb->pnn) {
2822 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2823 ctdb_addr_to_str(&addr),
2824 client_id, client->pid));
2825 /* failing this call will tell smbd to die */
2829 ip = talloc(client, struct ctdb_client_ip);
2830 CTDB_NO_MEMORY(ctdb, ip);
2834 ip->client_id = client_id;
2835 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2836 DLIST_ADD(ctdb->client_ip_list, ip);
2838 tcp = talloc(client, struct ctdb_tcp_list);
2839 CTDB_NO_MEMORY(ctdb, tcp);
2841 tcp->connection.src = tcp_sock->src;
2842 tcp->connection.dst = tcp_sock->dst;
2844 DLIST_ADD(client->tcp_list, tcp);
2846 t.src = tcp_sock->src;
2847 t.dst = tcp_sock->dst;
2849 data.dptr = (uint8_t *)&t;
2850 data.dsize = sizeof(t);
2852 switch (addr.sa.sa_family) {
2854 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2855 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2856 ctdb_addr_to_str(&tcp_sock->src),
2857 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2860 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2861 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2862 ctdb_addr_to_str(&tcp_sock->src),
2863 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2866 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2870 /* tell all nodes about this tcp connection */
2871 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2872 CTDB_CONTROL_TCP_ADD,
2873 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2875 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2883 find a tcp address on a list
2885 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2886 struct ctdb_connection *tcp)
2890 if (array == NULL) {
2894 for (i=0;i<array->num;i++) {
2895 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2896 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2897 return &array->connections[i];
2906 called by a daemon to inform us of a TCP connection that one of its
2907 clients managing that should tickled with an ACK when IP takeover is
2910 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2912 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2913 struct ctdb_tcp_array *tcparray;
2914 struct ctdb_connection tcp;
2915 struct ctdb_vnn *vnn;
2917 /* If we don't have public IPs, tickles are useless */
2918 if (ctdb->vnn == NULL) {
2922 vnn = find_public_ip_vnn(ctdb, &p->dst);
2924 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2925 ctdb_addr_to_str(&p->dst)));
2931 tcparray = vnn->tcp_array;
2933 /* If this is the first tickle */
2934 if (tcparray == NULL) {
2935 tcparray = talloc(vnn, struct ctdb_tcp_array);
2936 CTDB_NO_MEMORY(ctdb, tcparray);
2937 vnn->tcp_array = tcparray;
2940 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2941 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2943 tcparray->connections[tcparray->num].src = p->src;
2944 tcparray->connections[tcparray->num].dst = p->dst;
2947 if (tcp_update_needed) {
2948 vnn->tcp_update_needed = true;
2954 /* Do we already have this tickle ?*/
2957 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2958 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2959 ctdb_addr_to_str(&tcp.dst),
2960 ntohs(tcp.dst.ip.sin_port),
2965 /* A new tickle, we must add it to the array */
2966 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2967 struct ctdb_connection,
2969 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2971 tcparray->connections[tcparray->num].src = p->src;
2972 tcparray->connections[tcparray->num].dst = p->dst;
2975 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2976 ctdb_addr_to_str(&tcp.dst),
2977 ntohs(tcp.dst.ip.sin_port),
2980 if (tcp_update_needed) {
2981 vnn->tcp_update_needed = true;
2989 called by a daemon to inform us of a TCP connection that one of its
2990 clients managing that should tickled with an ACK when IP takeover is
2993 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
2995 struct ctdb_connection *tcpp;
2996 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
2999 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3000 ctdb_addr_to_str(&conn->dst)));
3004 /* if the array is empty we cant remove it
3005 and we don't need to do anything
3007 if (vnn->tcp_array == NULL) {
3008 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3009 ctdb_addr_to_str(&conn->dst),
3010 ntohs(conn->dst.ip.sin_port)));
3015 /* See if we know this connection
3016 if we don't know this connection then we dont need to do anything
3018 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3020 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3021 ctdb_addr_to_str(&conn->dst),
3022 ntohs(conn->dst.ip.sin_port)));
3027 /* We need to remove this entry from the array.
3028 Instead of allocating a new array and copying data to it
3029 we cheat and just copy the last entry in the existing array
3030 to the entry that is to be removed and just shring the
3033 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3034 vnn->tcp_array->num--;
3036 /* If we deleted the last entry we also need to remove the entire array
3038 if (vnn->tcp_array->num == 0) {
3039 talloc_free(vnn->tcp_array);
3040 vnn->tcp_array = NULL;
3043 vnn->tcp_update_needed = true;
3045 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3046 ctdb_addr_to_str(&conn->src),
3047 ntohs(conn->src.ip.sin_port)));
3052 called by a daemon to inform us of a TCP connection that one of its
3053 clients used are no longer needed in the tickle database
3055 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3057 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3059 /* If we don't have public IPs, tickles are useless */
3060 if (ctdb->vnn == NULL) {
3064 ctdb_remove_connection(ctdb, conn);
3071 Called when another daemon starts - causes all tickles for all
3072 public addresses we are serving to be sent to the new node on the
3073 next check. This actually causes the next scheduled call to
3074 tdb_update_tcp_tickles() to update all nodes. This is simple and
3075 doesn't require careful error handling.
3077 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3079 struct ctdb_vnn *vnn;
3081 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3082 (unsigned long) pnn));
3084 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3085 vnn->tcp_update_needed = true;
3093 called when a client structure goes away - hook to remove
3094 elements from the tcp_list in all daemons
3096 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3098 while (client->tcp_list) {
3099 struct ctdb_tcp_list *tcp = client->tcp_list;
3100 DLIST_REMOVE(client->tcp_list, tcp);
3101 ctdb_remove_connection(client->ctdb, &tcp->connection);
3106 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3108 struct ctdb_vnn *vnn;
3111 if (ctdb->tunable.disable_ip_failover == 1) {
3115 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3116 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3117 ctdb_vnn_unassign_iface(ctdb, vnn);
3124 /* Don't allow multiple releases at once. Some code,
3125 * particularly ctdb_tickle_sentenced_connections() is
3127 if (vnn->update_in_flight) {
3128 DEBUG(DEBUG_WARNING,
3130 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3131 ctdb_addr_to_str(&vnn->public_address),
3132 vnn->public_netmask_bits,
3133 ctdb_vnn_iface_string(vnn)));
3136 vnn->update_in_flight = true;
3138 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3139 ctdb_addr_to_str(&vnn->public_address),
3140 vnn->public_netmask_bits,
3141 ctdb_vnn_iface_string(vnn)));
3143 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3144 ctdb_vnn_iface_string(vnn),
3145 ctdb_addr_to_str(&vnn->public_address),
3146 vnn->public_netmask_bits);
3147 release_kill_clients(ctdb, &vnn->public_address);
3148 ctdb_vnn_unassign_iface(ctdb, vnn);
3149 vnn->update_in_flight = false;
3153 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3158 get list of public IPs
3160 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3161 struct ctdb_req_control_old *c, TDB_DATA *outdata)
3164 struct ctdb_public_ip_list_old *ips;
3165 struct ctdb_vnn *vnn;
3166 bool only_available = false;
3168 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3169 only_available = true;
3172 /* count how many public ip structures we have */
3174 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3178 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3179 num*sizeof(struct ctdb_public_ip);
3180 ips = talloc_zero_size(outdata, len);
3181 CTDB_NO_MEMORY(ctdb, ips);
3184 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3185 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3188 ips->ips[i].pnn = vnn->pnn;
3189 ips->ips[i].addr = vnn->public_address;
3193 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3194 i*sizeof(struct ctdb_public_ip);
3196 outdata->dsize = len;
3197 outdata->dptr = (uint8_t *)ips;
3203 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3204 struct ctdb_req_control_old *c,
3209 ctdb_sock_addr *addr;
3210 struct ctdb_public_ip_info_old *info;
3211 struct ctdb_vnn *vnn;
3213 addr = (ctdb_sock_addr *)indata.dptr;
3215 vnn = find_public_ip_vnn(ctdb, addr);
3217 /* if it is not a public ip it could be our 'single ip' */
3218 if (ctdb->single_ip_vnn) {
3219 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3220 vnn = ctdb->single_ip_vnn;
3225 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3226 "'%s'not a public address\n",
3227 ctdb_addr_to_str(addr)));
3231 /* count how many public ip structures we have */
3233 for (;vnn->ifaces[num];) {
3237 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3238 num*sizeof(struct ctdb_iface);
3239 info = talloc_zero_size(outdata, len);
3240 CTDB_NO_MEMORY(ctdb, info);
3242 info->ip.addr = vnn->public_address;
3243 info->ip.pnn = vnn->pnn;
3244 info->active_idx = 0xFFFFFFFF;
3246 for (i=0; vnn->ifaces[i]; i++) {
3247 struct ctdb_interface *cur;
3249 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3251 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3255 if (vnn->iface == cur) {
3256 info->active_idx = i;
3258 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3259 info->ifaces[i].link_state = cur->link_up;
3260 info->ifaces[i].references = cur->references;
3263 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3264 i*sizeof(struct ctdb_iface);
3266 outdata->dsize = len;
3267 outdata->dptr = (uint8_t *)info;
3272 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3273 struct ctdb_req_control_old *c,
3277 struct ctdb_iface_list_old *ifaces;
3278 struct ctdb_interface *cur;
3280 /* count how many public ip structures we have */
3282 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3286 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3287 num*sizeof(struct ctdb_iface);
3288 ifaces = talloc_zero_size(outdata, len);
3289 CTDB_NO_MEMORY(ctdb, ifaces);
3292 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3293 strcpy(ifaces->ifaces[i].name, cur->name);
3294 ifaces->ifaces[i].link_state = cur->link_up;
3295 ifaces->ifaces[i].references = cur->references;
3299 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3300 i*sizeof(struct ctdb_iface);
3302 outdata->dsize = len;
3303 outdata->dptr = (uint8_t *)ifaces;
3308 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3309 struct ctdb_req_control_old *c,
3312 struct ctdb_iface *info;
3313 struct ctdb_interface *iface;
3314 bool link_up = false;
3316 info = (struct ctdb_iface *)indata.dptr;
3318 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3319 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3320 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3321 len, len, info->name));
3325 switch (info->link_state) {
3333 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3334 (unsigned int)info->link_state));
3338 if (info->references != 0) {
3339 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3340 (unsigned int)info->references));
3344 iface = ctdb_find_iface(ctdb, info->name);
3345 if (iface == NULL) {
3349 if (link_up == iface->link_up) {
3353 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3354 ("iface[%s] has changed it's link status %s => %s\n",
3356 iface->link_up?"up":"down",
3357 link_up?"up":"down"));
3359 iface->link_up = link_up;
3365 structure containing the listening socket and the list of tcp connections
3366 that the ctdb daemon is to kill
3368 struct ctdb_kill_tcp {
3369 struct ctdb_vnn *vnn;
3370 struct ctdb_context *ctdb;
3372 struct tevent_fd *fde;
3373 trbt_tree_t *connections;
3378 a tcp connection that is to be killed
3380 struct ctdb_killtcp_con {
3381 ctdb_sock_addr src_addr;
3382 ctdb_sock_addr dst_addr;
3384 struct ctdb_kill_tcp *killtcp;
3387 /* this function is used to create a key to represent this socketpair
3388 in the killtcp tree.
3389 this key is used to insert and lookup matching socketpairs that are
3390 to be tickled and RST
3392 #define KILLTCP_KEYLEN 10
3393 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3395 static uint32_t key[KILLTCP_KEYLEN];
3397 bzero(key, sizeof(key));
3399 if (src->sa.sa_family != dst->sa.sa_family) {
3400 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3404 switch (src->sa.sa_family) {
3406 key[0] = dst->ip.sin_addr.s_addr;
3407 key[1] = src->ip.sin_addr.s_addr;
3408 key[2] = dst->ip.sin_port;
3409 key[3] = src->ip.sin_port;
3412 uint32_t *dst6_addr32 =
3413 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3414 uint32_t *src6_addr32 =
3415 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3416 key[0] = dst6_addr32[3];
3417 key[1] = src6_addr32[3];
3418 key[2] = dst6_addr32[2];
3419 key[3] = src6_addr32[2];
3420 key[4] = dst6_addr32[1];
3421 key[5] = src6_addr32[1];
3422 key[6] = dst6_addr32[0];
3423 key[7] = src6_addr32[0];
3424 key[8] = dst->ip6.sin6_port;
3425 key[9] = src->ip6.sin6_port;
3429 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3437 called when we get a read event on the raw socket
3439 static void capture_tcp_handler(struct tevent_context *ev,
3440 struct tevent_fd *fde,
3441 uint16_t flags, void *private_data)
3443 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3444 struct ctdb_killtcp_con *con;
3445 ctdb_sock_addr src, dst;
3446 uint32_t ack_seq, seq;
3448 if (!(flags & TEVENT_FD_READ)) {
3452 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3453 killtcp->private_data,
3455 &ack_seq, &seq) != 0) {
3456 /* probably a non-tcp ACK packet */
3460 /* check if we have this guy in our list of connections
3463 con = trbt_lookuparray32(killtcp->connections,
3464 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3466 /* no this was some other packet we can just ignore */
3470 /* This one has been tickled !
3471 now reset him and remove him from the list.
3473 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3474 ntohs(con->dst_addr.ip.sin_port),
3475 ctdb_addr_to_str(&con->src_addr),
3476 ntohs(con->src_addr.ip.sin_port)));
3478 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3483 /* when traversing the list of all tcp connections to send tickle acks to
3484 (so that we can capture the ack coming back and kill the connection
3486 this callback is called for each connection we are currently trying to kill
3488 static int tickle_connection_traverse(void *param, void *data)
3490 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3492 /* have tried too many times, just give up */
3493 if (con->count >= 5) {
3494 /* can't delete in traverse: reparent to delete_cons */
3495 talloc_steal(param, con);
3499 /* othervise, try tickling it again */
3502 (ctdb_sock_addr *)&con->dst_addr,
3503 (ctdb_sock_addr *)&con->src_addr,
3510 called every second until all sentenced connections have been reset
3512 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3513 struct tevent_timer *te,
3514 struct timeval t, void *private_data)
3516 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3517 void *delete_cons = talloc_new(NULL);
3519 /* loop over all connections sending tickle ACKs */
3520 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3522 /* now we've finished traverse, it's safe to do deletion. */
3523 talloc_free(delete_cons);
3525 /* If there are no more connections to kill we can remove the
3526 entire killtcp structure
3528 if ( (killtcp->connections == NULL) ||
3529 (killtcp->connections->root == NULL) ) {
3530 talloc_free(killtcp);
3534 /* try tickling them again in a seconds time
3536 tevent_add_timer(killtcp->ctdb->ev, killtcp,
3537 timeval_current_ofs(1, 0),
3538 ctdb_tickle_sentenced_connections, killtcp);
3542 destroy the killtcp structure
3544 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3546 struct ctdb_vnn *tmpvnn;
3548 /* verify that this vnn is still active */
3549 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3550 if (tmpvnn == killtcp->vnn) {
3555 if (tmpvnn == NULL) {
3559 if (killtcp->vnn->killtcp != killtcp) {
3563 killtcp->vnn->killtcp = NULL;
/* nothing fancy here, just unconditionally replace any existing
   connection structure with the new one.

   don't even free the old one if it did exist, that one is talloc_stolen
   by the same node in the tree anyway and will be deleted when the new data
   is deleted
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	/* data is whatever was stored under the key before; parm is the
	 * new connection and becomes the stored value */
	return parm;
}
3582 add a tcp socket to the list of connections we want to RST
3584 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3588 ctdb_sock_addr src, dst;
3589 struct ctdb_kill_tcp *killtcp;
3590 struct ctdb_killtcp_con *con;
3591 struct ctdb_vnn *vnn;
3593 ctdb_canonicalize_ip(s, &src);
3594 ctdb_canonicalize_ip(d, &dst);
3596 vnn = find_public_ip_vnn(ctdb, &dst);
3598 vnn = find_public_ip_vnn(ctdb, &src);
3601 /* if it is not a public ip it could be our 'single ip' */
3602 if (ctdb->single_ip_vnn) {
3603 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3604 vnn = ctdb->single_ip_vnn;
3609 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3613 killtcp = vnn->killtcp;
3615 /* If this is the first connection to kill we must allocate
3618 if (killtcp == NULL) {
3619 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3620 CTDB_NO_MEMORY(ctdb, killtcp);
3623 killtcp->ctdb = ctdb;
3624 killtcp->capture_fd = -1;
3625 killtcp->connections = trbt_create(killtcp, 0);
3627 vnn->killtcp = killtcp;
3628 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3633 /* create a structure that describes this connection we want to
3634 RST and store it in killtcp->connections
3636 con = talloc(killtcp, struct ctdb_killtcp_con);
3637 CTDB_NO_MEMORY(ctdb, con);
3638 con->src_addr = src;
3639 con->dst_addr = dst;
3641 con->killtcp = killtcp;
3644 trbt_insertarray32_callback(killtcp->connections,
3645 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3646 add_killtcp_callback, con);
3649 If we don't have a socket to listen on yet we must create it
3651 if (killtcp->capture_fd == -1) {
3652 const char *iface = ctdb_vnn_iface_string(vnn);
3653 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3654 if (killtcp->capture_fd == -1) {
3655 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3656 "socket on iface '%s' for killtcp (%s)\n",
3657 iface, strerror(errno)));
3663 if (killtcp->fde == NULL) {
3664 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3665 killtcp->capture_fd,
3667 capture_tcp_handler, killtcp);
3668 tevent_fd_set_auto_close(killtcp->fde);
3670 /* We also need to set up some events to tickle all these connections
3671 until they are all reset
3673 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3674 ctdb_tickle_sentenced_connections, killtcp);
3677 /* tickle him once now */
3686 talloc_free(vnn->killtcp);
3687 vnn->killtcp = NULL;
3692 kill a TCP connection.
3694 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3696 struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3698 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3702 called by a daemon to inform us of the entire list of TCP tickles for
3703 a particular public address.
3704 this control should only be sent by the node that is currently serving
3705 that public address.
3707 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3709 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3710 struct ctdb_tcp_array *tcparray;
3711 struct ctdb_vnn *vnn;
3713 /* We must at least have tickles.num or else we cant verify the size
3714 of the received data blob
3716 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3717 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3721 /* verify that the size of data matches what we expect */
3722 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3723 + sizeof(struct ctdb_connection) * list->num) {
3724 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3728 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3729 ctdb_addr_to_str(&list->addr)));
3731 vnn = find_public_ip_vnn(ctdb, &list->addr);
3733 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3734 ctdb_addr_to_str(&list->addr)));
3739 /* remove any old ticklelist we might have */
3740 talloc_free(vnn->tcp_array);
3741 vnn->tcp_array = NULL;
3743 tcparray = talloc(vnn, struct ctdb_tcp_array);
3744 CTDB_NO_MEMORY(ctdb, tcparray);
3746 tcparray->num = list->num;
3748 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3749 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3751 memcpy(tcparray->connections, &list->connections[0],
3752 sizeof(struct ctdb_connection)*tcparray->num);
3754 /* We now have a new fresh tickle list array for this vnn */
3755 vnn->tcp_array = tcparray;
3761 called to return the full list of tickles for the puclic address associated
3762 with the provided vnn
3764 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3766 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3767 struct ctdb_tickle_list_old *list;
3768 struct ctdb_tcp_array *tcparray;
3770 struct ctdb_vnn *vnn;
3772 vnn = find_public_ip_vnn(ctdb, addr);
3774 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3775 ctdb_addr_to_str(addr)));
3780 tcparray = vnn->tcp_array;
3782 num = tcparray->num;
3787 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3788 + sizeof(struct ctdb_connection) * num;
3790 outdata->dptr = talloc_size(outdata, outdata->dsize);
3791 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3792 list = (struct ctdb_tickle_list_old *)outdata->dptr;
3797 memcpy(&list->connections[0], tcparray->connections,
3798 sizeof(struct ctdb_connection) * num);
3806 set the list of all tcp tickles for a public address
3808 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3809 ctdb_sock_addr *addr,
3810 struct ctdb_tcp_array *tcparray)
3814 struct ctdb_tickle_list_old *list;
3817 num = tcparray->num;
3822 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3823 sizeof(struct ctdb_connection) * num;
3824 data.dptr = talloc_size(ctdb, data.dsize);
3825 CTDB_NO_MEMORY(ctdb, data.dptr);
3827 list = (struct ctdb_tickle_list_old *)data.dptr;
3831 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3834 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3835 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3836 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3838 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3842 talloc_free(data.dptr);
3849 perform tickle updates if required
3851 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3852 struct tevent_timer *te,
3853 struct timeval t, void *private_data)
3855 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3857 struct ctdb_vnn *vnn;
3859 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3860 /* we only send out updates for public addresses that
3863 if (ctdb->pnn != vnn->pnn) {
3866 /* We only send out the updates if we need to */
3867 if (!vnn->tcp_update_needed) {
3870 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3871 &vnn->public_address,
3874 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3875 ctdb_addr_to_str(&vnn->public_address)));
3878 ("Sent tickle update for public address %s\n",
3879 ctdb_addr_to_str(&vnn->public_address)));
3880 vnn->tcp_update_needed = false;
3884 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3885 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3886 ctdb_update_tcp_tickles, ctdb);
3890 start periodic update of tcp tickles
3892 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3894 ctdb->tickle_update_context = talloc_new(ctdb);
3896 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3897 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3898 ctdb_update_tcp_tickles, ctdb);
3904 struct control_gratious_arp {
3905 struct ctdb_context *ctdb;
3906 ctdb_sock_addr addr;
3912 send a control_gratuitous arp
3914 static void send_gratious_arp(struct tevent_context *ev,
3915 struct tevent_timer *te,
3916 struct timeval t, void *private_data)
3919 struct control_gratious_arp *arp = talloc_get_type(private_data,
3920 struct control_gratious_arp);
3922 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3924 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3925 arp->iface, strerror(errno)));
3930 if (arp->count == CTDB_ARP_REPEAT) {
3935 tevent_add_timer(arp->ctdb->ev, arp,
3936 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3937 send_gratious_arp, arp);
3944 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3946 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
3947 struct control_gratious_arp *arp;
3949 /* verify the size of indata */
3950 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3951 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
3952 (unsigned)indata.dsize,
3953 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
3957 ( offsetof(struct ctdb_addr_info_old, iface)
3958 + gratious_arp->len ) ){
3960 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3961 "but should be %u bytes\n",
3962 (unsigned)indata.dsize,
3963 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
3968 arp = talloc(ctdb, struct control_gratious_arp);
3969 CTDB_NO_MEMORY(ctdb, arp);
3972 arp->addr = gratious_arp->addr;
3973 arp->iface = talloc_strdup(arp, gratious_arp->iface);
3974 CTDB_NO_MEMORY(ctdb, arp->iface);
3977 tevent_add_timer(arp->ctdb->ev, arp,
3978 timeval_zero(), send_gratious_arp, arp);
3983 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3985 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3988 /* verify the size of indata */
3989 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3990 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3994 ( offsetof(struct ctdb_addr_info_old, iface)
3997 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3998 "but should be %u bytes\n",
3999 (unsigned)indata.dsize,
4000 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4004 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4006 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4009 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
/*
  state kept while the release of a public address that is being
  deleted is in progress
 */
struct delete_ip_callback_state {
	/* the DEL_PUBLIC_IP control that is answered from the callback */
	struct ctdb_req_control_old *c;
};
4021 called when releaseip event finishes for del_public_address
4023 static void delete_ip_callback(struct ctdb_context *ctdb,
4024 int32_t status, TDB_DATA data,
4025 const char *errormsg,
4028 struct delete_ip_callback_state *state =
4029 talloc_get_type(private_data, struct delete_ip_callback_state);
4031 /* If release failed then fail. */
4032 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4033 talloc_free(private_data);
4036 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4037 struct ctdb_req_control_old *c,
4038 TDB_DATA indata, bool *async_reply)
4040 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4041 struct ctdb_vnn *vnn;
4043 /* verify the size of indata */
4044 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4045 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4049 ( offsetof(struct ctdb_addr_info_old, iface)
4052 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4053 "but should be %u bytes\n",
4054 (unsigned)indata.dsize,
4055 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4059 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4061 /* walk over all public addresses until we find a match */
4062 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4063 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4064 if (vnn->pnn == ctdb->pnn) {
4065 struct delete_ip_callback_state *state;
4066 struct ctdb_public_ip *ip;
4070 vnn->delete_pending = true;
4072 state = talloc(ctdb,
4073 struct delete_ip_callback_state);
4074 CTDB_NO_MEMORY(ctdb, state);
4077 ip = talloc(state, struct ctdb_public_ip);
4080 (__location__ " Out of memory\n"));
4085 ip->addr = pub->addr;
4087 data.dsize = sizeof(struct ctdb_public_ip);
4088 data.dptr = (unsigned char *)ip;
4090 ret = ctdb_daemon_send_control(ctdb,
4093 CTDB_CONTROL_RELEASE_IP,
4100 (__location__ "Unable to send "
4101 "CTDB_CONTROL_RELEASE_IP\n"));
4106 state->c = talloc_steal(state, c);
4107 *async_reply = true;
4109 /* This IP is not hosted on the
4110 * current node so just delete it
4112 do_delete_ip(ctdb, vnn);
4119 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4120 ctdb_addr_to_str(&pub->addr)));
/*
  state kept while the "ipreallocated" event is running
 */
struct ipreallocated_callback_state {
	/* the control that is answered once the event has finished */
	struct ctdb_req_control_old *c;
};
/*
 * Completion callback for the ipreallocated event script.
 *
 * p is the struct ipreallocated_callback_state set up by the control
 * handler. A failure message including status is logged (the guarding
 * condition is not visible in this listing - presumably a non-zero status).
 * When status is -ETIME the node bans itself. Finally the deferred control
 * request state->c is answered with status.
 */
4129 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4130 int status, void *p)
4132 struct ipreallocated_callback_state *state =
4133 talloc_get_type(p, struct ipreallocated_callback_state);
4137 (" \"ipreallocated\" event script failed (status %d)\n",
/* -ETIME: presumably the script timed out - confirm against the event code */
4139 if (status == -ETIME) {
4140 ctdb_ban_self(ctdb);
/* send the deferred reply, passing the script status through */
4144 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4148 /* A control to run the ipreallocated event */
/*
 * Allocates a struct ipreallocated_callback_state on ctdb and starts the
 * CTDB_EVENT_IPREALLOCATED event script with ctdb_ipreallocated_callback()
 * as its completion callback. When the script was started, the request c is
 * reparented onto the state and *async_reply is set, so the reply is sent
 * from the callback.
 *
 * NOTE(review): the third parameter line, the check of ret and the return
 * statements are not visible in this listing.
 */
4149 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4150 struct ctdb_req_control_old *c,
4154 struct ipreallocated_callback_state *state;
4156 state = talloc(ctdb, struct ipreallocated_callback_state);
4157 CTDB_NO_MEMORY(ctdb, state);
4159 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
/* state is passed both as the second argument and as the callback private data */
4161 ret = ctdb_event_script_callback(ctdb, state,
4162 ctdb_ipreallocated_callback, state,
4163 CTDB_EVENT_IPREALLOCATED,
4167 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4172 /* tell the control that we will reply asynchronously */
4173 state->c = talloc_steal(state, c);
4174 *async_reply = true;
4180 /* This function is called from the recovery daemon to verify that a remote
4181 node has the expected ip allocation.
4182 This is verified against ctdb->ip_tree
/*
 * Check the public IP list reported by a node against ctdb->ip_tree.
 *
 * Each entry of ips is looked up in the tree using ip_key() of its address.
 * An address that is missing from the tree is logged as new or unknown.
 * Entries where either the tree or the report has pnn == -1 are tested
 * separately (the handling is not visible here, presumably skipped -
 * confirm). A differing pnn is logged as an inconsistent allocation.
 *
 * NOTE(review): the remaining parameter(s), the local declarations and all
 * return statements are missing from this listing; the pnn used in the log
 * message is presumably the reporting node passed in by the caller.
 */
4184 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4185 struct ctdb_public_ip_list_old *ips,
4188 struct public_ip_list *tmp_ip;
4191 if (ctdb->ip_tree == NULL) {
4192 /* don't know the expected allocation yet, assume remote node
4201 for (i=0; i<ips->num; i++) {
4202 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4203 if (tmp_ip == NULL) {
4204 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4208 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4212 if (tmp_ip->pnn != ips->ips[i].pnn) {
4214 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4216 ctdb_addr_to_str(&ips->ips[i].addr),
4217 ips->ips[i].pnn, tmp_ip->pnn));
/*
 * Record a new owner for one public IP in ctdb->ip_tree.
 *
 * ip->addr selects the tree entry (via ip_key()) and ip->pnn is stored as
 * its new pnn; the change is logged at NOTICE level. The function bails out
 * early when the disable_ip_failover tunable is set, when there is no tree
 * yet, or when the address has no entry in the tree (the latter two are
 * logged as errors).
 *
 * NOTE(review): the return statements are not visible in this listing, so
 * the values returned on each path cannot be confirmed from here.
 */
4225 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4227 struct public_ip_list *tmp_ip;
4229 /* IP tree is never built if DisableIPFailover is set */
4230 if (ctdb->tunable.disable_ip_failover != 0) {
4234 if (ctdb->ip_tree == NULL) {
4235 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4239 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4240 if (tmp_ip == NULL) {
4241 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
/* log the old and new owner before overwriting the entry */
4245 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4246 tmp_ip->pnn = ip->pnn;
/*
 * Drop the IP assignment tree. TALLOC_FREE releases ctdb->ip_tree and
 * leaves the pointer set to NULL, so it is safe when no tree exists.
 */
4251 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4253 TALLOC_FREE(ctdb->ip_tree);
/*
 * Bookkeeping for one in-flight reload of the public IPs, created by
 * ctdb_control_reload_public_ips().
 *
 * NOTE(review): the code in this file also uses the members status, fd[]
 * and child of this struct; their declarations are not visible in this
 * listing.
 */
4256 struct ctdb_reloadips_handle {
/* daemon context; its reload_ips pointer is cleared by the destructor */
4257 struct ctdb_context *ctdb;
/* deferred control request, answered from the destructor */
4258 struct ctdb_req_control_old *c;
/* fd event watching the read end of the pipe to the child */
4262 struct tevent_fd *fde;
/*
 * talloc destructor for a reloadips handle (installed by
 * ctdb_control_reload_public_ips()).
 *
 * Clears ctdb->reload_ips if it still points at this handle, sends the
 * deferred reply for h->c with h->status, and sends SIGKILL to the child
 * process.
 *
 * NOTE(review): the return statement is not visible in this listing.
 */
4265 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4267 if (h == h->ctdb->reload_ips) {
4268 h->ctdb->reload_ips = NULL;
4271 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4274 ctdb_kill(h->ctdb, h->child, SIGKILL);
/*
 * Timer callback registered by ctdb_control_reload_public_ips() with a
 * 120 second delay. private_data is the reloadips handle.
 *
 * NOTE(review): what is done with h is not visible in this listing -
 * presumably the handle is freed, which would run the destructor; confirm.
 */
4278 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4279 struct tevent_timer *te,
4280 struct timeval t, void *private_data)
4282 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/*
 * fd event handler for the read end of the pipe to the reloadips child.
 *
 * Reads the single result byte written by the child. A short read or a
 * non-zero byte is logged as a child error.
 *
 * NOTE(review): the local declarations and the statements that store the
 * result and dispose of h are not visible in this listing.
 */
4287 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4288 struct tevent_fd *fde,
4289 uint16_t flags, void *private_data)
4291 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/* the child writes exactly one status byte, 0 meaning success */
4296 ret = sys_read(h->fd[0], &res, 1);
4297 if (ret < 1 || res != 0) {
4298 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
/*
 * Body of the reloadips child process: bring the set of public IPs known to
 * the local daemon in line with the public addresses file.
 *
 * Steps, as far as visible here:
 *   1. fetch the current public IPs from the local node (ips);
 *   2. re-read the public addresses file into ctdb->vnn;
 *   3. for every IP in ips that has no matching vnn, queue a
 *      CTDB_CONTROL_DEL_PUBLIC_IP control;
 *   4. for every vnn that has no matching IP in ips, queue a
 *      CTDB_CONTROL_ADD_PUBLIC_IP control carrying address, mask and a
 *      comma separated interface list;
 *   5. wait for all queued controls to complete.
 * Temporary data lives on mem_ctx, which is freed on the visible exit paths.
 *
 * NOTE(review): return statements, several declarations (ret, i, data, len,
 * iface, ...) and some call arguments are missing from this listing, so the
 * return values cannot be confirmed from here.
 * NOTE(review): CTDB_NO_MEMORY presumably returns from the function when the
 * pointer is NULL; on those paths mem_ctx does not appear to be freed -
 * confirm.
 */
4306 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4308 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4309 struct ctdb_public_ip_list_old *ips;
4310 struct ctdb_vnn *vnn;
4311 struct client_async_data *async_data;
4312 struct timeval timeout;
4314 struct ctdb_client_control_state *state;
4318 CTDB_NO_MEMORY(ctdb, mem_ctx);
4320 /* Read IPs from local node */
4321 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4322 CTDB_CURRENT_NODE, mem_ctx, &ips);
4325 ("Unable to fetch public IPs from local node\n"));
4326 talloc_free(mem_ctx);
4330 /* Read IPs file - this is safe since this is a child process */
4332 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4333 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4334 talloc_free(mem_ctx);
/* collects every control sent below so they can be awaited together */
4338 async_data = talloc_zero(mem_ctx, struct client_async_data);
4339 CTDB_NO_MEMORY(ctdb, async_data);
4341 /* Compare IPs between node and file for IPs to be deleted */
4342 for (i = 0; i < ips->num; i++) {
4344 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4345 if (ctdb_same_ip(&vnn->public_address,
4346 &ips->ips[i].addr)) {
4347 /* IP is still in file */
4353 /* Delete IP ips->ips[i] */
4354 struct ctdb_addr_info_old *pub;
4357 ("IP %s no longer configured, deleting it\n",
4358 ctdb_addr_to_str(&ips->ips[i].addr)));
/* zeroed request: only the address is filled in for a delete */
4360 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4361 CTDB_NO_MEMORY(ctdb, pub);
4363 pub->addr = ips->ips[i].addr;
4367 timeout = TAKEOVER_TIMEOUT();
4369 data.dsize = offsetof(struct ctdb_addr_info_old,
4371 data.dptr = (uint8_t *)pub;
4373 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4374 CTDB_CONTROL_DEL_PUBLIC_IP,
4375 0, data, async_data,
4377 if (state == NULL) {
4380 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4384 ctdb_client_async_add(async_data, state);
4388 /* Compare IPs between node and file for IPs to be added */
4390 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4391 for (i = 0; i < ips->num; i++) {
4392 if (ctdb_same_ip(&vnn->public_address,
4393 &ips->ips[i].addr)) {
4394 /* IP already on node */
/* the inner loop ran to completion, so no IP on the node matched this vnn */
4398 if (i == ips->num) {
4399 /* Add IP vnn->public_address, which the node does not know yet */
4400 struct ctdb_addr_info_old *pub;
4401 const char *ifaces = NULL;
4406 ("New IP %s configured, adding it\n",
4407 ctdb_addr_to_str(&vnn->public_address)));
/* broadcast this node number with the rebalance srvid; failure only warns */
4409 uint32_t pnn = ctdb_get_pnn(ctdb);
4411 data.dsize = sizeof(pnn);
4412 data.dptr = (uint8_t *)&pnn;
4414 ret = ctdb_client_send_message(
4416 CTDB_BROADCAST_CONNECTED,
4417 CTDB_SRVID_REBALANCE_NODE,
4420 DEBUG(DEBUG_WARNING,
4421 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
/*
 * build a comma separated list of the interface names of this vnn
 * NOTE(review): each talloc_asprintf result is parented to vnn and, as far
 * as visible, is neither checked for NULL before strlen nor freed - confirm
 */
4427 ifaces = vnn->ifaces[0];
4429 while (vnn->ifaces[iface] != NULL) {
4430 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4431 vnn->ifaces[iface]);
/* len includes the terminating NUL; the request is sized to hold the list */
4435 len = strlen(ifaces) + 1;
4436 pub = talloc_zero_size(mem_ctx,
4437 offsetof(struct ctdb_addr_info_old, iface) + len);
4438 CTDB_NO_MEMORY(ctdb, pub);
4440 pub->addr = vnn->public_address;
4441 pub->mask = vnn->public_netmask_bits;
/* pub->len is presumably set to len on a line not visible here - confirm */
4443 memcpy(&pub->iface[0], ifaces, pub->len);
4445 timeout = TAKEOVER_TIMEOUT();
4447 data.dsize = offsetof(struct ctdb_addr_info_old,
4449 data.dptr = (uint8_t *)pub;
4451 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4452 CTDB_CONTROL_ADD_PUBLIC_IP,
4453 0, data, async_data,
4455 if (state == NULL) {
4458 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4462 ctdb_client_async_add(async_data, state);
/* block until every queued add/delete control has completed */
4466 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4467 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4471 talloc_free(mem_ctx);
4475 talloc_free(mem_ctx);
4479 /* This control is sent to force the node to re-read the public addresses file
4480 and drop any addresses we should no longer host, and add new addresses
4481 that we are now able to host
/*
 * Control handler that reloads the public addresses in a forked child.
 *
 * Any reload that is still in progress is cancelled by freeing its handle.
 * A new handle is allocated on ctdb, a pipe is created and a child is
 * forked. The child switches to client mode, runs ctdb_reloadips_child(),
 * writes the one byte result to the pipe and then keeps polling for the
 * parent process. The parent reparents the request c onto the handle,
 * installs the destructor, watches the read end of the pipe with
 * ctdb_reloadips_child_handler() and arms a 120 second timer with
 * ctdb_reloadips_timeout_event(). The reply is deferred via *async_reply.
 *
 * NOTE(review): return statements, error path cleanup and a few assignments
 * (for example where ctdb->reload_ips and h->status are set) are not visible
 * in this listing.
 */
4483 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4485 struct ctdb_reloadips_handle *h;
/* remembered before forking so the child can poll for the parent */
4486 pid_t parent = getpid();
/* freeing an earlier handle runs its destructor */
4488 if (ctdb->reload_ips != NULL) {
4489 talloc_free(ctdb->reload_ips);
4490 ctdb->reload_ips = NULL;
4493 h = talloc(ctdb, struct ctdb_reloadips_handle);
4494 CTDB_NO_MEMORY(ctdb, h);
/*
 * NOTE(review): the message below names ctdb_freeze_lock although this is
 * the reloadips pipe - looks like a copy and paste leftover
 */
4499 if (pipe(h->fd) == -1) {
4500 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4505 h->child = ctdb_fork(ctdb);
4506 if (h->child == (pid_t)-1) {
4507 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
/* child process */
4515 if (h->child == 0) {
4516 signed char res = 0;
4519 debug_extra = talloc_asprintf(NULL, "reloadips:");
4521 prctl_set_comment("ctdb_reloadips");
4522 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4523 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4526 res = ctdb_reloadips_child(ctdb);
4528 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
/* report the result to the parent as a single byte */
4532 sys_write(h->fd[1], &res, 1);
4533 /* make sure we die when our parent dies */
4534 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* parent process: keep the request alive under the handle */
4540 h->c = talloc_steal(h, c);
4543 set_close_on_exec(h->fd[0]);
4545 talloc_set_destructor(h, ctdb_reloadips_destructor);
/* the fd event is a talloc child of h and closes fd[0] when it is freed */
4548 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4549 ctdb_reloadips_child_handler, (void *)h);
4550 tevent_fd_set_auto_close(h->fde);
/* the timer is also parented to h, so it goes away together with the handle */
4552 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4553 ctdb_reloadips_timeout_event, h);
4555 /* we reply later */
4556 *async_reply = true;