4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
45 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
47 #define CTDB_ARP_INTERVAL 1
48 #define CTDB_ARP_REPEAT 3
50 /* Flags used in IP allocation algorithms. */
56 enum ipalloc_algorithm {
57 IPALLOC_DETERMINISTIC,
58 IPALLOC_NONDETERMINISTIC,
62 struct ipalloc_state {
65 /* Arrays with data for each node */
66 struct ctdb_public_ip_list_old **known_public_ips;
67 struct ctdb_public_ip_list_old **available_public_ips;
69 enum ipalloc_algorithm algorithm;
70 uint32_t no_ip_failback;
73 struct ctdb_interface {
74 struct ctdb_interface *prev, *next;
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
83 return vnn->iface->name;
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
91 struct ctdb_interface *i;
93 /* Verify that we don't have an entry for this interface yet */
94 for (i=ctdb->ifaces;i;i=i->next) {
95 if (strcmp(i->name, iface) == 0) {
100 /* create a new structure for this interface */
101 i = talloc_zero(ctdb, struct ctdb_interface);
102 CTDB_NO_MEMORY_FATAL(ctdb, i);
103 i->name = talloc_strdup(i, iface);
104 CTDB_NO_MEMORY(ctdb, i->name);
108 DLIST_ADD(ctdb->ifaces, i);
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
118 for (n = 0; vnn->ifaces[n] != NULL; n++) {
119 if (strcmp(name, vnn->ifaces[n]) == 0) {
127 /* If any interfaces now have no possible IPs then delete them. This
128 * implementation is naive (i.e. simple) rather than clever
129 * (i.e. complex). Given that this is run on delip and that operation
130 * is rare, this doesn't need to be efficient - it needs to be
131 * foolproof. One alternative is reference counting, where the logic
132 * is distributed and can, therefore, be broken in multiple places.
133 * Another alternative is to build a red-black tree of interfaces that
134 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135 * once) and then walking ctdb->ifaces once and deleting those not in
136 * the tree. Let's go to one of those if the naive implementation
137 * causes problems... :-)
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140 struct ctdb_vnn *vnn)
142 struct ctdb_interface *i, *next;
144 /* For each interface, check if there's an IP using it. */
145 for (i = ctdb->ifaces; i != NULL; i = next) {
150 /* Only consider interfaces named in the given VNN. */
151 if (!vnn_has_interface_with_name(vnn, i->name)) {
155 /* Is the "single IP" on this interface? */
156 if ((ctdb->single_ip_vnn != NULL) &&
157 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159 /* Found, next interface please... */
162 /* Search for a vnn with this interface. */
164 for (tv=ctdb->vnn; tv; tv=tv->next) {
165 if (vnn_has_interface_with_name(tv, i->name)) {
172 /* None of the VNNs are using this interface. */
173 DLIST_REMOVE(ctdb->ifaces, i);
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
183 struct ctdb_interface *i;
185 for (i=ctdb->ifaces;i;i=i->next) {
186 if (strcmp(i->name, iface) == 0) {
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195 struct ctdb_vnn *vnn)
198 struct ctdb_interface *cur = NULL;
199 struct ctdb_interface *best = NULL;
201 for (i=0; vnn->ifaces[i]; i++) {
203 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
217 if (cur->references < best->references) {
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227 struct ctdb_vnn *vnn)
229 struct ctdb_interface *best = NULL;
232 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233 "still assigned to iface '%s'\n",
234 ctdb_addr_to_str(&vnn->public_address),
235 ctdb_vnn_iface_string(vnn)));
239 best = ctdb_vnn_best_iface(ctdb, vnn);
241 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242 "cannot assign to iface any iface\n",
243 ctdb_addr_to_str(&vnn->public_address)));
249 vnn->pnn = ctdb->pnn;
251 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252 "now assigned to iface '%s' refs[%d]\n",
253 ctdb_addr_to_str(&vnn->public_address),
254 ctdb_vnn_iface_string(vnn),
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260 struct ctdb_vnn *vnn)
262 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263 "now unassigned (old iface '%s' refs[%d])\n",
264 ctdb_addr_to_str(&vnn->public_address),
265 ctdb_vnn_iface_string(vnn),
266 vnn->iface?vnn->iface->references:0));
268 vnn->iface->references--;
271 if (vnn->pnn == ctdb->pnn) {
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277 struct ctdb_vnn *vnn)
281 /* Nodes that are not RUNNING can not host IPs */
282 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
286 if (vnn->delete_pending) {
290 if (vnn->iface && vnn->iface->link_up) {
294 for (i=0; vnn->ifaces[i]; i++) {
295 struct ctdb_interface *cur;
297 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
310 struct ctdb_takeover_arp {
311 struct ctdb_context *ctdb;
314 struct ctdb_tcp_array *tcparray;
315 struct ctdb_vnn *vnn;
320 lists of tcp endpoints
322 struct ctdb_tcp_list {
323 struct ctdb_tcp_list *prev, *next;
324 struct ctdb_connection connection;
328 list of clients to kill on IP release
330 struct ctdb_client_ip {
331 struct ctdb_client_ip *prev, *next;
332 struct ctdb_context *ctdb;
339 send a gratuitous arp
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342 struct tevent_timer *te,
343 struct timeval t, void *private_data)
345 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
346 struct ctdb_takeover_arp);
348 struct ctdb_tcp_array *tcparray;
349 const char *iface = ctdb_vnn_iface_string(arp->vnn);
351 ret = ctdb_sys_send_arp(&arp->addr, iface);
353 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354 iface, strerror(errno)));
357 tcparray = arp->tcparray;
359 for (i=0;i<tcparray->num;i++) {
360 struct ctdb_connection *tcon;
362 tcon = &tcparray->connections[i];
363 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364 (unsigned)ntohs(tcon->dst.ip.sin_port),
365 ctdb_addr_to_str(&tcon->src),
366 (unsigned)ntohs(tcon->src.ip.sin_port)));
367 ret = ctdb_sys_send_tcp(
372 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373 ctdb_addr_to_str(&tcon->src)));
380 if (arp->count == CTDB_ARP_REPEAT) {
385 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387 ctdb_control_send_arp, arp);
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391 struct ctdb_vnn *vnn)
393 struct ctdb_takeover_arp *arp;
394 struct ctdb_tcp_array *tcparray;
396 if (!vnn->takeover_ctx) {
397 vnn->takeover_ctx = talloc_new(vnn);
398 if (!vnn->takeover_ctx) {
403 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
409 arp->addr = vnn->public_address;
412 tcparray = vnn->tcp_array;
414 /* add all of the known tcp connections for this IP to the
415 list of tcp connections to send tickle acks for */
416 arp->tcparray = talloc_steal(arp, tcparray);
418 vnn->tcp_array = NULL;
419 vnn->tcp_update_needed = true;
422 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423 timeval_zero(), ctdb_control_send_arp, arp);
428 struct takeover_callback_state {
429 struct ctdb_req_control_old *c;
430 ctdb_sock_addr *addr;
431 struct ctdb_vnn *vnn;
434 struct ctdb_do_takeip_state {
435 struct ctdb_req_control_old *c;
436 struct ctdb_vnn *vnn;
440 called when takeip event finishes
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
445 struct ctdb_do_takeip_state *state =
446 talloc_get_type(private_data, struct ctdb_do_takeip_state);
451 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
453 if (status == -ETIME) {
456 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457 ctdb_addr_to_str(&state->vnn->public_address),
458 ctdb_vnn_iface_string(state->vnn)));
459 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
461 node->flags |= NODE_FLAGS_UNHEALTHY;
466 if (ctdb->do_checkpublicip) {
468 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
470 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
477 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478 data.dsize = strlen((char *)data.dptr) + 1;
479 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
481 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
484 /* the control succeeded */
485 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
492 state->vnn->update_in_flight = false;
497 take over an ip address
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500 struct ctdb_req_control_old *c,
501 struct ctdb_vnn *vnn)
504 struct ctdb_do_takeip_state *state;
506 if (vnn->update_in_flight) {
507 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508 "update for this IP already in flight\n",
509 ctdb_addr_to_str(&vnn->public_address),
510 vnn->public_netmask_bits));
514 ret = ctdb_vnn_assign_iface(ctdb, vnn);
516 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517 "assign a usable interface\n",
518 ctdb_addr_to_str(&vnn->public_address),
519 vnn->public_netmask_bits));
523 state = talloc(vnn, struct ctdb_do_takeip_state);
524 CTDB_NO_MEMORY(ctdb, state);
526 state->c = talloc_steal(ctdb, c);
529 vnn->update_in_flight = true;
530 talloc_set_destructor(state, ctdb_takeip_destructor);
532 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533 ctdb_addr_to_str(&vnn->public_address),
534 vnn->public_netmask_bits,
535 ctdb_vnn_iface_string(vnn)));
537 ret = ctdb_event_script_callback(ctdb,
539 ctdb_do_takeip_callback,
543 ctdb_vnn_iface_string(vnn),
544 ctdb_addr_to_str(&vnn->public_address),
545 vnn->public_netmask_bits);
548 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549 ctdb_addr_to_str(&vnn->public_address),
550 ctdb_vnn_iface_string(vnn)));
558 struct ctdb_do_updateip_state {
559 struct ctdb_req_control_old *c;
560 struct ctdb_interface *old;
561 struct ctdb_vnn *vnn;
565 called when updateip event finishes
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
570 struct ctdb_do_updateip_state *state =
571 talloc_get_type(private_data, struct ctdb_do_updateip_state);
575 if (status == -ETIME) {
578 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579 ctdb_addr_to_str(&state->vnn->public_address),
581 ctdb_vnn_iface_string(state->vnn)));
584 * All we can do is reset the old interface
585 * and let the next run fix it
587 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588 state->vnn->iface = state->old;
589 state->vnn->iface->references++;
591 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
596 if (ctdb->do_checkpublicip) {
598 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
600 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
607 /* the control succeeded */
608 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
615 state->vnn->update_in_flight = false;
620 update (move) an ip address
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623 struct ctdb_req_control_old *c,
624 struct ctdb_vnn *vnn)
627 struct ctdb_do_updateip_state *state;
628 struct ctdb_interface *old = vnn->iface;
629 const char *new_name;
631 if (vnn->update_in_flight) {
632 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633 "update for this IP already in flight\n",
634 ctdb_addr_to_str(&vnn->public_address),
635 vnn->public_netmask_bits));
639 ctdb_vnn_unassign_iface(ctdb, vnn);
640 ret = ctdb_vnn_assign_iface(ctdb, vnn);
642 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643 "assin a usable interface (old iface '%s')\n",
644 ctdb_addr_to_str(&vnn->public_address),
645 vnn->public_netmask_bits,
650 new_name = ctdb_vnn_iface_string(vnn);
651 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652 /* A benign update from one interface onto itself.
653 * no need to run the eventscripts in this case, just return
656 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
660 state = talloc(vnn, struct ctdb_do_updateip_state);
661 CTDB_NO_MEMORY(ctdb, state);
663 state->c = talloc_steal(ctdb, c);
667 vnn->update_in_flight = true;
668 talloc_set_destructor(state, ctdb_updateip_destructor);
670 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671 "interface %s to %s\n",
672 ctdb_addr_to_str(&vnn->public_address),
673 vnn->public_netmask_bits,
677 ret = ctdb_event_script_callback(ctdb,
679 ctdb_do_updateip_callback,
681 CTDB_EVENT_UPDATE_IP,
685 ctdb_addr_to_str(&vnn->public_address),
686 vnn->public_netmask_bits);
688 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689 ctdb_addr_to_str(&vnn->public_address),
690 old->name, new_name));
699 Find the vnn of the node that has a public ip address
700 returns NULL if the address is not known as a public address
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
704 struct ctdb_vnn *vnn;
706 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707 if (ctdb_same_ip(&vnn->public_address, addr)) {
716 take over an ip address
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719 struct ctdb_req_control_old *c,
724 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725 struct ctdb_vnn *vnn;
726 bool have_ip = false;
727 bool do_updateip = false;
728 bool do_takeip = false;
729 struct ctdb_interface *best_iface = NULL;
731 if (pip->pnn != ctdb->pnn) {
732 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733 "with pnn %d, but we're node %d\n",
734 ctdb_addr_to_str(&pip->addr),
735 pip->pnn, ctdb->pnn));
739 /* update our vnn list */
740 vnn = find_public_ip_vnn(ctdb, &pip->addr);
742 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743 ctdb_addr_to_str(&pip->addr)));
747 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748 have_ip = ctdb_sys_have_ip(&pip->addr);
750 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751 if (best_iface == NULL) {
752 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
753 "a usable interface (old %s, have_ip %d)\n",
754 ctdb_addr_to_str(&vnn->public_address),
755 vnn->public_netmask_bits,
756 ctdb_vnn_iface_string(vnn),
761 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
767 if (vnn->iface == NULL && have_ip) {
768 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770 ctdb_addr_to_str(&vnn->public_address)));
774 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776 "and we have it on iface[%s], but it was assigned to node %d"
777 "and we are node %d, banning ourself\n",
778 ctdb_addr_to_str(&vnn->public_address),
779 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
784 if (vnn->pnn == -1 && have_ip) {
785 vnn->pnn = ctdb->pnn;
786 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787 "and we already have it on iface[%s], update local daemon\n",
788 ctdb_addr_to_str(&vnn->public_address),
789 ctdb_vnn_iface_string(vnn)));
794 if (vnn->iface != best_iface) {
795 if (!vnn->iface->link_up) {
797 } else if (vnn->iface->references > (best_iface->references + 1)) {
798 /* only move when the rebalance gains something */
806 ctdb_vnn_unassign_iface(ctdb, vnn);
813 ret = ctdb_do_takeip(ctdb, c, vnn);
817 } else if (do_updateip) {
818 ret = ctdb_do_updateip(ctdb, c, vnn);
824 * The interface is up and the kernel knows the ip
827 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828 ctdb_addr_to_str(&pip->addr),
829 vnn->public_netmask_bits,
830 ctdb_vnn_iface_string(vnn)));
834 /* tell ctdb_control.c that we will be replying asynchronously */
841 kill any clients that are registered with an IP that is being released
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
845 struct ctdb_client_ip *ip;
847 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848 ctdb_addr_to_str(addr)));
850 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851 ctdb_sock_addr tmp_addr;
854 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
856 ctdb_addr_to_str(&ip->addr)));
858 if (ctdb_same_ip(&tmp_addr, addr)) {
859 struct ctdb_client *client = reqid_find(ctdb->idr,
862 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
864 ctdb_addr_to_str(&ip->addr),
867 if (client->pid != 0) {
868 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869 (unsigned)client->pid,
870 ctdb_addr_to_str(addr),
872 kill(client->pid, SIGKILL);
878 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
880 DLIST_REMOVE(ctdb->vnn, vnn);
881 ctdb_vnn_unassign_iface(ctdb, vnn);
882 ctdb_remove_orphaned_ifaces(ctdb, vnn);
887 called when releaseip event finishes
889 static void release_ip_callback(struct ctdb_context *ctdb, int status,
892 struct takeover_callback_state *state =
893 talloc_get_type(private_data, struct takeover_callback_state);
896 if (status == -ETIME) {
900 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901 if (ctdb_sys_have_ip(state->addr)) {
903 ("IP %s still hosted during release IP callback, failing\n",
904 ctdb_addr_to_str(state->addr)));
905 ctdb_request_control_reply(ctdb, state->c,
912 /* send a message to all clients of this node telling them
913 that the cluster has been reconfigured and they should
914 release any sockets on this IP */
915 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917 data.dsize = strlen((char *)data.dptr)+1;
919 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
921 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
923 /* kill clients that have registered with this IP */
924 release_kill_clients(ctdb, state->addr);
926 ctdb_vnn_unassign_iface(ctdb, state->vnn);
928 /* Process the IP if it has been marked for deletion */
929 if (state->vnn->delete_pending) {
930 do_delete_ip(ctdb, state->vnn);
934 /* the control succeeded */
935 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
941 if (state->vnn != NULL) {
942 state->vnn->update_in_flight = false;
948 release an ip address
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
951 struct ctdb_req_control_old *c,
956 struct takeover_callback_state *state;
957 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958 struct ctdb_vnn *vnn;
961 /* update our vnn list */
962 vnn = find_public_ip_vnn(ctdb, &pip->addr);
964 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965 ctdb_addr_to_str(&pip->addr)));
970 /* stop any previous arps */
971 talloc_free(vnn->takeover_ctx);
972 vnn->takeover_ctx = NULL;
974 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975 * lazy multicast to drop an IP from any node that isn't the
976 * intended new node. The following makes ctdbd ignore
977 * a release for any address it doesn't host.
979 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980 if (!ctdb_sys_have_ip(&pip->addr)) {
981 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982 ctdb_addr_to_str(&pip->addr),
983 vnn->public_netmask_bits,
984 ctdb_vnn_iface_string(vnn)));
985 ctdb_vnn_unassign_iface(ctdb, vnn);
989 if (vnn->iface == NULL) {
990 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991 ctdb_addr_to_str(&pip->addr),
992 vnn->public_netmask_bits));
997 /* There is a potential race between take_ip and us because we
998 * update the VNN via a callback that runs when the
999 * eventscripts have been run. Avoid the race by allowing one
1000 * update to be in flight at a time.
1002 if (vnn->update_in_flight) {
1003 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004 "update for this IP already in flight\n",
1005 ctdb_addr_to_str(&vnn->public_address),
1006 vnn->public_netmask_bits));
1010 iface = strdup(ctdb_vnn_iface_string(vnn));
1012 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
1013 ctdb_addr_to_str(&pip->addr),
1014 vnn->public_netmask_bits,
1018 state = talloc(ctdb, struct takeover_callback_state);
1019 if (state == NULL) {
1020 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021 __FILE__, __LINE__);
1026 state->c = talloc_steal(state, c);
1027 state->addr = talloc(state, ctdb_sock_addr);
1028 if (state->addr == NULL) {
1029 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030 __FILE__, __LINE__);
1035 *state->addr = pip->addr;
1038 vnn->update_in_flight = true;
1039 talloc_set_destructor(state, ctdb_releaseip_destructor);
1041 ret = ctdb_event_script_callback(ctdb,
1042 state, release_ip_callback, state,
1043 CTDB_EVENT_RELEASE_IP,
1046 ctdb_addr_to_str(&pip->addr),
1047 vnn->public_netmask_bits);
1050 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051 ctdb_addr_to_str(&pip->addr),
1052 ctdb_vnn_iface_string(vnn)));
1057 /* tell the control that we will reply asynchronously */
1058 *async_reply = true;
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063 ctdb_sock_addr *addr,
1064 unsigned mask, const char *ifaces,
1067 struct ctdb_vnn *vnn;
1074 tmp = strdup(ifaces);
1075 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076 if (!ctdb_sys_check_iface_exists(iface)) {
1077 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1084 /* Verify that we don't have an entry for this ip yet */
1085 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1088 ctdb_addr_to_str(addr)));
1093 /* create a new vnn structure for this ip address */
1094 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097 tmp = talloc_strdup(vnn, ifaces);
1098 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1099 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1107 vnn->ifaces[num] = NULL;
1108 vnn->public_address = *addr;
1109 vnn->public_netmask_bits = mask;
1111 if (check_address) {
1112 if (ctdb_sys_have_ip(addr)) {
1113 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114 vnn->pnn = ctdb->pnn;
1118 for (i=0; vnn->ifaces[i]; i++) {
1119 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1121 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122 "for public_address[%s]\n",
1123 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1129 DLIST_ADD(ctdb->vnn, vnn);
1135 setup the public address lists from a file
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1143 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144 if (lines == NULL) {
1145 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1148 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1152 for (i=0;i<nlines;i++) {
1154 ctdb_sock_addr addr;
1155 const char *addrstr;
1160 while ((*line == ' ') || (*line == '\t')) {
1166 if (strcmp(line, "") == 0) {
1169 tok = strtok(line, " \t");
1171 tok = strtok(NULL, " \t");
1173 if (NULL == ctdb->default_public_interface) {
1174 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1179 ifaces = ctdb->default_public_interface;
1184 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1189 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1205 struct ctdb_vnn *svnn;
1206 struct ctdb_interface *cur = NULL;
1210 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211 CTDB_NO_MEMORY(ctdb, svnn);
1213 svnn->ifaces = talloc_array(svnn, const char *, 2);
1214 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217 svnn->ifaces[1] = NULL;
1219 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1225 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1227 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228 "for single_ip[%s]\n",
1230 ctdb_addr_to_str(&svnn->public_address)));
1235 /* assume the single public ip interface is initially "good" */
1236 cur = ctdb_find_iface(ctdb, iface);
1238 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1241 cur->link_up = true;
1243 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1249 ctdb->single_ip_vnn = svnn;
1253 struct public_ip_list {
1254 struct public_ip_list *next;
1256 ctdb_sock_addr addr;
1259 /* Given a physical node, return the number of
1260 public addresses that are currently assigned to this node.
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1266 for (;ips;ips=ips->next) {
1267 if (ips->pnn == pnn) {
1275 /* Can the given node host the given IP: is the public IP known to the
1276 * node and is NOIPHOST unset?
1278 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1279 struct ctdb_ipflags ipflags,
1280 struct public_ip_list *ip)
1282 struct ctdb_public_ip_list_old *public_ips;
1285 if (ipflags.noiphost) {
1289 public_ips = ctdb->ipalloc_state->available_public_ips[pnn];
1291 if (public_ips == NULL) {
1295 for (i=0; i<public_ips->num; i++) {
1296 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1297 /* yes, this node can serve this public ip */
1305 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1306 struct ctdb_ipflags ipflags,
1307 struct public_ip_list *ip)
1309 if (ipflags.noiptakeover) {
1313 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1316 /* search the node list for a node to take over this ip.
1317 pick the node that is currently serving the fewest ips
1318 so that the ips get spread out evenly.
1320 static int find_takeover_node(struct ctdb_context *ctdb,
1321 struct ctdb_ipflags *ipflags,
1322 struct public_ip_list *ip,
1323 struct public_ip_list *all_ips)
1325 int pnn, min=0, num;
1328 numnodes = talloc_array_length(ipflags);
1330 for (i=0; i<numnodes; i++) {
1331 /* verify that this node can serve this ip */
1332 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1333 /* no it couldn't so skip to the next node */
1337 num = node_ip_coverage(i, all_ips);
1338 /* was this the first node we checked ? */
1350 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1351 ctdb_addr_to_str(&ip->addr)));
1361 static uint32_t *ip_key(ctdb_sock_addr *ip)
1363 static uint32_t key[IP_KEYLEN];
1365 bzero(key, sizeof(key));
1367 switch (ip->sa.sa_family) {
1369 key[3] = htonl(ip->ip.sin_addr.s_addr);
1372 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1373 key[0] = htonl(s6_a32[0]);
1374 key[1] = htonl(s6_a32[1]);
1375 key[2] = htonl(s6_a32[2]);
1376 key[3] = htonl(s6_a32[3]);
1380 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1387 static void *add_ip_callback(void *parm, void *data)
1389 struct public_ip_list *this_ip = parm;
1390 struct public_ip_list *prev_ip = data;
1392 if (prev_ip == NULL) {
1395 if (this_ip->pnn == -1) {
1396 this_ip->pnn = prev_ip->pnn;
1402 static int getips_count_callback(void *param, void *data)
1404 struct public_ip_list **ip_list = (struct public_ip_list **)param;
1405 struct public_ip_list *new_ip = (struct public_ip_list *)data;
1407 new_ip->next = *ip_list;
1412 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1413 struct ctdb_public_ip_list_old *ips,
1416 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1417 struct ipalloc_state *ipalloc_state,
1418 struct ctdb_node_map_old *nodemap)
1423 if (ipalloc_state->num != nodemap->num) {
1426 " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1427 ipalloc_state->num, nodemap->num));
1431 for (j=0; j<nodemap->num; j++) {
1432 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1436 /* Retrieve the list of known public IPs from the node */
1437 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1442 &ipalloc_state->known_public_ips[j]);
1445 ("Failed to read known public IPs from node: %u\n",
1450 if (ctdb->do_checkpublicip) {
1451 verify_remote_ip_allocation(ctdb,
1452 ipalloc_state->known_public_ips[j],
1456 /* Retrieve the list of available public IPs from the node */
1457 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1461 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1462 &ipalloc_state->available_public_ips[j]);
1465 ("Failed to read available public IPs from node: %u\n",
/* Build the merged list of public IP addresses known across the cluster.
 *
 * Any existing ctdb->ip_tree is freed and a new tree is created.  For each
 * node index i, every address in ctdb->ipalloc_state->known_public_ips[i]
 * gets a zeroed struct public_ip_list allocated on the new tree and is
 * inserted under ip_key() of that address.  The pnn a node reports for an
 * address is copied only when it is the reporting node's own pnn.  The
 * tree is finally traversed with getips_count_callback, which receives
 * &ip_list.
 *
 * Nodes flagged NODE_FLAGS_DELETED and nodes without a known_public_ips
 * entry are tested for explicitly before their addresses are read.
 * ctdb->ipalloc_state must have been set by the caller.
 */
1474 static struct public_ip_list *
1475 create_merged_ip_list(struct ctdb_context *ctdb)
1478 struct public_ip_list *ip_list;
1479 struct ctdb_public_ip_list_old *public_ips;
1481 if (ctdb->ip_tree != NULL) {
1482 talloc_free(ctdb->ip_tree);
1483 ctdb->ip_tree = NULL;
1485 ctdb->ip_tree = trbt_create(ctdb, 0);
1487 for (i=0;i<ctdb->num_nodes;i++) {
1488 public_ips = ctdb->ipalloc_state->known_public_ips[i];
1490 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1494 /* there were no public ips for this node */
1495 if (public_ips == NULL) {
1499 for (j=0;j<public_ips->num;j++) {
1500 struct public_ip_list *tmp_ip;
1502 tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1503 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1504 /* Do not use information about IP addresses hosted
1505 * on other nodes, it may not be accurate */
1506 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1507 tmp_ip->pnn = public_ips->ips[j].pnn;
1511 tmp_ip->addr = public_ips->ips[j].addr;
1512 tmp_ip->next = NULL;
1514 trbt_insertarray32_callback(ctdb->ip_tree,
1515 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1522 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1528 * This is the length of the longest common prefix between the IPs.
1529 * It is calculated by XOR-ing the 2 IPs together and counting the
1530 * number of leading zeroes. The implementation means that all
1531 * addresses end up being 128 bits long.
1533 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1534 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1535 * lots of nodes and IP addresses?
/* Count the leading bits that two addresses have in common, working on
 * the IP_KEYLEN 32-bit words produced by ip_key().  The key of ip1 is
 * copied into ip1_k before use; each word is XOR-ed against t[]
 * (presumably the key of ip2 - confirm) and the leading zero bits of the
 * result are accumulated in distance.
 *
 * The top-bit mask is built from an unsigned constant: left-shifting a
 * signed 1 by 31 overflows a 32-bit int, which is undefined behaviour.
 */
1537 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1539 uint32_t ip1_k[IP_KEYLEN];
1544 uint32_t distance = 0;
1546 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1548 for (i=0; i<IP_KEYLEN; i++) {
1549 x = ip1_k[i] ^ t[i];
1553 /* Count number of leading zeroes.
1554 * FIXME? This could be optimised...
1556 while ((x & (1U << 31)) == 0) {
1566 /* Calculate the IP distance for the given IP relative to IPs on the
1567 given node. The ips argument is generally the all_ips variable
1568 used in the main part of the algorithm.
/* Accumulate the squared ip_distance() between ip and entries of ips.
 *
 *   ip  - address being costed
 *   ips - list to walk (callers may pass a tail of the full list)
 *   pnn - node whose entries are of interest; each entry's pnn is tested
 *         against it at the top of the loop
 *
 * The entry whose addr member is the very object ip points at is singled
 * out by pointer comparison, so an address is never measured against
 * itself.
 */
1570 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1571 struct public_ip_list *ips,
1574 struct public_ip_list *t;
1579 for (t=ips; t != NULL; t=t->next) {
1580 if (t->pnn != pnn) {
1584 /* Optimisation: We never calculate the distance
1585 * between an address and itself. This allows us to
1586 * calculate the effect of removing an address from a
1587 * node by simply calculating the distance between
1588 * that address and all of the existing addresses.
1589 * Moreover, we assume that we're only ever dealing
1590 * with addresses from all_ips so we can identify an
1591 * address via a pointer rather than doing a more
1592 * expensive address comparison. */
1593 if (&(t->addr) == ip) {
1597 d = ip_distance(ip, &(t->addr));
1598 sum += d * d; /* Cheaper than pulling in math.h :-) */
1604 /* Return the LCP2 imbalance metric for addresses currently assigned
/* Compute the LCP2 imbalance of node pnn from the list all_ips.
 *
 * Each entry's pnn is tested against pnn; for the entries of interest,
 * ip_distance_2_sum() is called with the remainder of the list (t->next)
 * rather than the whole list, and the results are added into imbalance.
 */
1607 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1609 struct public_ip_list *t;
1611 uint32_t imbalance = 0;
1613 for (t=all_ips; t!=NULL; t=t->next) {
1614 if (t->pnn != pnn) {
1617 /* Pass the rest of the IPs rather than the whole
1620 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1626 /* Allocate any unassigned IPs just by looping through the IPs and
1627 * finding the best node for each.
/* Parameters:
 *   ctdb, ipflags - handed to find_takeover_node()
 *   all_ips       - list of all public IPs
 *
 * Every entry whose pnn is -1 is passed to find_takeover_node(); a
 * non-zero result is reported with a warning naming the address.
 */
1629 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1630 struct ctdb_ipflags *ipflags,
1631 struct public_ip_list *all_ips)
1633 struct public_ip_list *tmp_ip;
1635 /* loop over all ip's and find a physical node to cover for
1638 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1639 if (tmp_ip->pnn == -1) {
1640 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1641 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1642 ctdb_addr_to_str(&tmp_ip->addr)));
1648 /* Basic non-deterministic rebalancing algorithm.
/* Parameters:
 *   ctdb    - handed to can_node_takeover_ip() and find_takeover_node()
 *   ipflags - per-node flags; its talloc array length is the node count
 *   all_ips - list of all public IPs; pnn == -1 is tested for first
 *   num_ips - used only to bound the number of retries (num_ips + 5)
 *
 * For each IP, the nodes that can take it over are scanned and their
 * node_ip_coverage() values are used to track the most and least loaded
 * node.  If no such node exists a warning is logged.  When the most
 * loaded node serves at least two more IPs than the least loaded one and
 * the retry budget is not exhausted, an IP currently on the most loaded
 * node is handed to find_takeover_node() for reassignment.
 */
1650 static void basic_failback(struct ctdb_context *ctdb,
1651 struct ctdb_ipflags *ipflags,
1652 struct public_ip_list *all_ips,
1656 int maxnode, maxnum, minnode, minnum, num, retries;
1657 struct public_ip_list *tmp_ip;
1659 numnodes = talloc_array_length(ipflags);
1666 /* for each ip address, loop over all nodes that can serve
1667 this ip and make sure that the difference between the node
1668 serving the most and the node serving the least ip's are
1671 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1672 if (tmp_ip->pnn == -1) {
1676 /* Get the highest and lowest number of ips's served by any
1677 valid node which can serve this ip.
1681 for (i=0; i<numnodes; i++) {
1682 /* only check nodes that can actually serve this ip */
1683 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1684 /* no it couldnt so skip to the next node */
1688 num = node_ip_coverage(i, all_ips);
1689 if (maxnode == -1) {
1698 if (minnode == -1) {
1708 if (maxnode == -1) {
1709 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1710 ctdb_addr_to_str(&tmp_ip->addr)));
1715 /* if the spread between the smallest and largest coverage by
1716 a node is >=2 we steal one of the ips from the node with
1717 most coverage to even things out a bit.
1718 try to do this a limited number of times since we dont
1719 want to spend too much time balancing the ip coverage.
1721 if ( (maxnum > minnum+1)
1722 && (retries < (num_ips + 5)) ){
1723 struct public_ip_list *tmp;
1725 /* Reassign one of maxnode's VNNs */
1726 for (tmp=all_ips;tmp;tmp=tmp->next) {
1727 if (tmp->pnn == maxnode) {
1728 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
/* Set up the per-node working arrays for the LCP2 algorithm.
 *
 *   tmp_ctx               - talloc parent for both arrays and the context
 *                           given to CTDB_NO_MEMORY_FATAL.  Declared as
 *                           struct ctdb_context *, although ip_alloc_lcp2()
 *                           passes the result of talloc_new().
 *   ipflags               - its talloc array length gives the node count
 *   all_ips               - current IP list, read for existing assignments
 *   force_rebalance_nodes - optional talloc array of PNNs; NULL is tested
 *   lcp2_imbalances       - out: lcp2_imbalance(all_ips, i) for each node
 *   rebalance_candidates  - out: per-node candidate flags
 *
 * Every node starts as a candidate, any node that currently holds an IP
 * is cleared, and each PNN listed in force_rebalance_nodes is set again.
 * A listed PNN that is not below the node count is logged (and presumably
 * skipped - confirm).
 */
1737 static void lcp2_init(struct ctdb_context *tmp_ctx,
1738 struct ctdb_ipflags *ipflags,
1739 struct public_ip_list *all_ips,
1740 uint32_t *force_rebalance_nodes,
1741 uint32_t **lcp2_imbalances,
1742 bool **rebalance_candidates)
1745 struct public_ip_list *tmp_ip;
1747 numnodes = talloc_array_length(ipflags);
1749 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1750 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1751 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1752 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1754 for (i=0; i<numnodes; i++) {
1755 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1756 /* First step: assume all nodes are candidates */
1757 (*rebalance_candidates)[i] = true;
1760 /* 2nd step: if a node has IPs assigned then it must have been
1761 * healthy before, so we remove it from consideration. This
1762 * is overkill but is all we have because we don't maintain
1763 * state between takeover runs. An alternative would be to
1764 * keep state and invalidate it every time the recovery master
1767 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1768 if (tmp_ip->pnn != -1) {
1769 (*rebalance_candidates)[tmp_ip->pnn] = false;
1773 /* 3rd step: if a node is forced to re-balance then
1774 we allow failback onto the node */
1775 if (force_rebalance_nodes == NULL) {
1778 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1779 uint32_t pnn = force_rebalance_nodes[i];
1780 if (pnn >= numnodes) {
1782 (__location__ "unknown node %u\n", pnn));
1787 ("Forcing rebalancing of IPs to node %u\n", pnn));
1788 (*rebalance_candidates)[pnn] = true;
1792 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1793 * the IP/node combination that will cost the least.
/* Parameters:
 *   ctdb, ipflags   - used when asking can_node_takeover_ip()
 *   all_ips         - list of all public IPs; pnn == -1 marks unassigned
 *   lcp2_imbalances - per-node imbalance, overwritten for the node that
 *                     receives an IP
 *
 * Each pass of the outer loop costs every unassigned IP against every
 * node able to take it over, using ip_distance_2_sum(), and remembers the
 * cheapest combination; that IP is then given to that node.  Passes
 * repeat while unassigned IPs remain and should_loop is set.  Any IP
 * still unassigned afterwards is reported with a warning.
 */
1795 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1796 struct ctdb_ipflags *ipflags,
1797 struct public_ip_list *all_ips,
1798 uint32_t *lcp2_imbalances)
1800 struct public_ip_list *tmp_ip;
1801 int dstnode, numnodes;
1804 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1805 struct public_ip_list *minip;
1807 bool should_loop = true;
1808 bool have_unassigned = true;
1810 numnodes = talloc_array_length(ipflags);
1812 while (have_unassigned && should_loop) {
1813 should_loop = false;
1815 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1816 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1822 /* loop over each unassigned ip. */
1823 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1824 if (tmp_ip->pnn != -1) {
1828 for (dstnode=0; dstnode<numnodes; dstnode++) {
1829 /* only check nodes that can actually takeover this ip */
1830 if (!can_node_takeover_ip(ctdb, dstnode,
1833 /* no it couldnt so skip to the next node */
1837 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1838 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1839 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1840 ctdb_addr_to_str(&(tmp_ip->addr)),
1842 dstimbl - lcp2_imbalances[dstnode]));
1845 if ((minnode == -1) || (dstdsum < mindsum)) {
1855 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1857 /* If we found one then assign it to the given node. */
1858 if (minnode != -1) {
1859 minip->pnn = minnode;
1860 lcp2_imbalances[minnode] = minimbl;
1861 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1862 ctdb_addr_to_str(&(minip->addr)),
1867 /* There might be a better way but at least this is clear. */
1868 have_unassigned = false;
1869 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1870 if (tmp_ip->pnn == -1) {
1871 have_unassigned = true;
1876 /* We know if we have an unassigned addresses so we might as
1879 if (have_unassigned) {
1880 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1881 if (tmp_ip->pnn == -1) {
1882 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1883 ctdb_addr_to_str(&tmp_ip->addr)));
1889 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1890 * to move IPs from, determines the best IP/destination node
1891 * combination to move from the source node.
/* Parameters:
 *   ctdb, ipflags        - used when asking can_node_takeover_ip()
 *   all_ips              - list of all public IPs
 *   srcnode              - node to move an IP away from
 *   lcp2_imbalances      - per-node imbalance; the source and destination
 *                          entries are overwritten when a move is made
 *   rebalance_candidates - per-node flags tested before a node is
 *                          considered as a destination
 *
 * A move qualifies only if the destination's resulting imbalance is below
 * the source's current imbalance and the IP costs the destination less
 * than it costs the source; among qualifying moves the one with the
 * lowest combined resulting imbalance is kept.  When one is found the
 * IP's pnn is changed to the destination node.
 *
 * NOTE(review): the boolean result presumably reports whether a move was
 * made - confirm against the return statements.
 */
1893 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1894 struct ctdb_ipflags *ipflags,
1895 struct public_ip_list *all_ips,
1897 uint32_t *lcp2_imbalances,
1898 bool *rebalance_candidates)
1900 int dstnode, mindstnode, numnodes;
1901 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1902 uint32_t minsrcimbl, mindstimbl;
1903 struct public_ip_list *minip;
1904 struct public_ip_list *tmp_ip;
1906 /* Find an IP and destination node that best reduces imbalance. */
1913 numnodes = talloc_array_length(ipflags);
1915 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1916 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1917 srcnode, lcp2_imbalances[srcnode]));
1919 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1920 /* Only consider addresses on srcnode. */
1921 if (tmp_ip->pnn != srcnode) {
1925 /* What is this IP address costing the source node? */
1926 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1927 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1929 /* Consider this IP address would cost each potential
1930 * destination node. Destination nodes are limited to
1931 * those that are newly healthy, since we don't want
1932 * to do gratuitous failover of IPs just to make minor
1933 * balance improvements.
1935 for (dstnode=0; dstnode<numnodes; dstnode++) {
1936 if (!rebalance_candidates[dstnode]) {
1940 /* only check nodes that can actually takeover this ip */
1941 if (!can_node_takeover_ip(ctdb, dstnode,
1942 ipflags[dstnode], tmp_ip)) {
1943 /* no it couldnt so skip to the next node */
1947 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1948 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1949 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1951 ctdb_addr_to_str(&(tmp_ip->addr)),
1954 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1955 (dstdsum < srcdsum) && \
1956 ((mindstnode == -1) || \
1957 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1960 minsrcimbl = srcimbl;
1961 mindstnode = dstnode;
1962 mindstimbl = dstimbl;
1966 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1968 if (mindstnode != -1) {
1969 /* We found a move that makes things better... */
1970 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1971 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1972 ctdb_addr_to_str(&(minip->addr)),
1973 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1976 lcp2_imbalances[srcnode] = minsrcimbl;
1977 lcp2_imbalances[mindstnode] = mindstimbl;
1978 minip->pnn = mindstnode;
/* Element of the array that lcp2_failback() fills and sorts: it carries a
 * node's LCP2 imbalance (the imbalance member compared by
 * lcp2_cmp_imbalance_pnn()) together with, judging by the name, the
 * node's PNN.
 */
1987 struct lcp2_imbalance_pnn {
/* qsort() comparator for struct lcp2_imbalance_pnn; a and b are compared
 * by their imbalance member only.
 * NOTE(review): lcp2_failback() wants the most imbalanced node first, so
 * this presumably sorts in descending order - confirm against the values
 * returned.
 */
1992 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
1994 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
1995 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
1997 if (lipa->imbalance > lipb->imbalance) {
1999 } else if (lipa->imbalance == lipb->imbalance) {
2006 /* LCP2 algorithm for rebalancing the cluster. This finds the source
2007 * node with the highest LCP2 imbalance, and then determines the best
2008 * IP/destination node combination to move from the source node.
/* Parameters:
 *   ctdb                 - talloc parent of the temporary array; also
 *                          passed to lcp2_failback_candidate()
 *   ipflags              - its talloc array length gives the node count
 *   all_ips              - list of all public IPs
 *   lcp2_imbalances      - per-node imbalance, copied into the array
 *   rebalance_candidates - passed to lcp2_failback_candidate()
 *
 * The per-node imbalances are copied into a temporary array, sorted with
 * lcp2_cmp_imbalance_pnn(), and the nodes are then tried in sorted order
 * as the source of a move.  An entry with imbalance 0 is tested for
 * before a candidate is tried.
 *
 * The temporary array is checked right after allocation because the loop
 * that follows writes to it immediately.
 */
2010 static void lcp2_failback(struct ctdb_context *ctdb,
2011 struct ctdb_ipflags *ipflags,
2012 struct public_ip_list *all_ips,
2013 uint32_t *lcp2_imbalances,
2014 bool *rebalance_candidates)
2017 struct lcp2_imbalance_pnn * lips;
2020 numnodes = talloc_array_length(ipflags);
2023 /* Put the imbalances and nodes into an array, sort them and
2024 * iterate through candidates. Usually the 1st one will be
2025 * used, so this doesn't cost much...
2027 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2028 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2029 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
CTDB_NO_MEMORY_FATAL(ctdb, lips);
2030 for (i=0; i<numnodes; i++) {
2031 lips[i].imbalance = lcp2_imbalances[i];
2033 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2035 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2036 lcp2_cmp_imbalance_pnn);
2039 for (i=0; i<numnodes; i++) {
2040 /* This means that all nodes had 0 or 1 addresses, so
2041 * can't be imbalanced.
2043 if (lips[i].imbalance == 0) {
2047 if (lcp2_failback_candidate(ctdb,
2052 rebalance_candidates)) {
/* Parameters:
 *   ctdb    - handed to can_node_host_ip()
 *   ipflags - per-node flags, indexed by the IP's current pnn
 *   all_ips - list of all public IPs
 *
 * Every assigned IP (pnn != -1) is checked with can_node_host_ip()
 * against the node it is currently assigned to; when the node cannot
 * host it the unassignment is logged.
 *
 * The check is written as a plain negation; the former trailing "!= 0"
 * compared the already-negated result with zero and changed nothing.
 */
2064 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2065 struct ctdb_ipflags *ipflags,
2066 struct public_ip_list *all_ips)
2068 struct public_ip_list *tmp_ip;
2070 /* verify that the assigned nodes can serve that public ip
2071 and set it to -1 if not
2073 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2074 if (tmp_ip->pnn == -1) {
2077 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2078 ipflags[tmp_ip->pnn], tmp_ip)) {
2079 /* this node can not serve this ip. */
2080 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2081 ctdb_addr_to_str(&(tmp_ip->addr)),
/* Deterministic allocation.
 *
 *   ctdb    - context; ctdb->ipalloc_state->no_ip_failback is consulted
 *   ipflags - per-node flags; its talloc array length is the node count
 *   all_ips - list of all public IPs, reassigned in place
 *
 * The i-th list entry is assigned to node (i % node count).  A warning is
 * logged when no_ip_failback is 1, since it is ignored here.  IPs on
 * nodes that cannot host them are then unassigned via
 * unassign_unsuitable_ips() and handed out again by
 * basic_allocate_unassigned().  No failback step is run.
 */
2088 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2089 struct ctdb_ipflags *ipflags,
2090 struct public_ip_list *all_ips)
2092 struct public_ip_list *tmp_ip;
2095 numnodes = talloc_array_length(ipflags);
2097 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2098 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2099 * always be allocated the same way for a specific set of
2100 * available/unavailable nodes.
2103 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2104 tmp_ip->pnn = i % numnodes;
2107 /* IP failback doesn't make sense with deterministic
2108 * IPs, since the modulo step above implicitly fails
2109 * back IPs to their "home" node.
2111 if (1 == ctdb->ipalloc_state->no_ip_failback) {
2112 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2115 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2117 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2119 /* No failback here! */
/* Non-deterministic allocation.
 *
 *   ctdb    - context; ctdb->ipalloc_state->no_ip_failback is consulted
 *   ipflags - per-node flags handed to the helpers
 *   all_ips - list of all public IPs, reassigned in place
 *
 * The list is walked once up front (num_ips is later passed to
 * basic_failback()).  Unsuitable assignments are removed with
 * unassign_unsuitable_ips(), unassigned IPs are handed out by
 * basic_allocate_unassigned(), and basic_failback() follows;
 * no_ip_failback == 1 is tested for before that last step.
 */
2122 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2123 struct ctdb_ipflags *ipflags,
2124 struct public_ip_list *all_ips)
2126 /* This should be pushed down into basic_failback. */
2127 struct public_ip_list *tmp_ip;
2129 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2133 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2135 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2137 /* If we don't want IPs to fail back then don't rebalance IPs. */
2138 if (1 == ctdb->ipalloc_state->no_ip_failback) {
2142 /* Now, try to make sure the ip adresses are evenly distributed
2145 basic_failback(ctdb, ipflags, all_ips, num_ips);
/* LCP2 allocation.
 *
 *   ctdb                  - context; parent of the temporary talloc
 *                           context and source of no_ip_failback
 *   ipflags               - per-node flags; array length is the node count
 *   all_ips               - list of all public IPs, reassigned in place
 *   force_rebalance_nodes - passed to lcp2_init()
 *
 * Steps: unassign_unsuitable_ips(), lcp2_init() (working arrays live on
 * tmp_ctx), lcp2_allocate_unassigned(), then lcp2_failback().
 * no_ip_failback == 1 and a zero count of rebalance candidates are both
 * tested for before the failback step.  tmp_ctx is freed at the end.
 */
2148 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2149 struct ctdb_ipflags *ipflags,
2150 struct public_ip_list *all_ips,
2151 uint32_t *force_rebalance_nodes)
2153 uint32_t *lcp2_imbalances;
2154 bool *rebalance_candidates;
2155 int numnodes, num_rebalance_candidates, i;
2157 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2159 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2161 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2162 &lcp2_imbalances, &rebalance_candidates);
2164 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2166 /* If we don't want IPs to fail back then don't rebalance IPs. */
2167 if (1 == ctdb->ipalloc_state->no_ip_failback) {
2171 /* It is only worth continuing if we have suitable target
2172 * nodes to transfer IPs to. This check is much cheaper than
2175 numnodes = talloc_array_length(ipflags);
2176 num_rebalance_candidates = 0;
2177 for (i=0; i<numnodes; i++) {
2178 if (rebalance_candidates[i]) {
2179 num_rebalance_candidates++;
2182 if (num_rebalance_candidates == 0) {
2186 /* Now, try to make sure the ip adresses are evenly distributed
2189 lcp2_failback(ctdb, ipflags, all_ips,
2190 lcp2_imbalances, rebalance_candidates);
2193 talloc_free(tmp_ctx);
/* Scan nodemap for a node that has neither NODE_FLAGS_INACTIVE nor
 * NODE_FLAGS_DISABLED set.  Going by the name, the result is true only
 * when no such node exists - NOTE(review): confirm against the return
 * statements.
 */
2196 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2200 for (i=0;i<nodemap->num;i++) {
2201 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2202 /* Found one completely healthy node */
2210 /* The calculation part of the IP allocation algorithm. */
/* Parameters:
 *   ctdb                  - context; ctdb->ipalloc_state->algorithm picks
 *                           the allocator
 *   ipflags               - per-node flags handed to the allocator
 *   all_ips_p             - out: receives the list built by
 *                           create_merged_ip_list()
 *   force_rebalance_nodes - only passed to ip_alloc_lcp2()
 *
 * Builds the merged IP list, then dispatches to ip_alloc_lcp2(),
 * ip_alloc_deterministic_ips() or ip_alloc_nondeterministic_ips().
 */
2211 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2212 struct ctdb_ipflags *ipflags,
2213 struct public_ip_list **all_ips_p,
2214 uint32_t *force_rebalance_nodes)
2216 /* since nodes only know about those public addresses that
2217 can be served by that particular node, no single node has
2218 a full list of all public addresses that exist in the cluster.
2219 Walk over all node structures and create a merged list of
2220 all public addresses that exist in the cluster.
2222 keep the tree of ips around as ctdb->ip_tree
2224 *all_ips_p = create_merged_ip_list(ctdb);
2226 switch (ctdb->ipalloc_state->algorithm) {
2228 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2230 case IPALLOC_DETERMINISTIC:
2231 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2233 case IPALLOC_NONDETERMINISTIC:
2234 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2238 /* at this point ->pnn is the node which will own each IP
2239 or -1 if there is no node that can cover this ip
/* State shared between get_tunable_from_nodes() and its two callbacks.
 * Besides the tunable name, the callers in this file use an out array
 * indexed by PNN and a fatal flag.
 */
2245 struct get_tunable_callback_data {
2246 const char *tunable;
/* Reply callback for the GET_TUNABLE control sent by
 * get_tunable_from_nodes(); callback points at a struct
 * get_tunable_callback_data.
 *
 * A payload that is not exactly sizeof(uint32_t) bytes is logged as an
 * error, and the talloc array length of cd->out is consulted before the
 * value is stored in cd->out[pnn].
 */
2251 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2252 int32_t res, TDB_DATA outdata,
2255 struct get_tunable_callback_data *cd =
2256 (struct get_tunable_callback_data *)callback;
2260 /* Already handled in fail callback */
2264 if (outdata.dsize != sizeof(uint32_t)) {
2265 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2266 cd->tunable, pnn, (int)sizeof(uint32_t),
2267 (int)outdata.dsize));
2272 size = talloc_array_length(cd->out);
2274 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2275 cd->tunable, pnn, size));
2280 cd->out[pnn] = *(uint32_t *)outdata.dptr;
/* Failure callback for the GET_TUNABLE control; callback points at a
 * struct get_tunable_callback_data.  One of three messages naming the
 * tunable and the node is logged: a timeout, the tunable not being
 * implemented on that node, or an unexpected error.
 *
 * NOTE(review): get_tunable_from_nodes() tests callback_data.fatal after
 * the control, so it is presumably set here for some of these cases -
 * confirm which.
 */
2283 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2284 int32_t res, TDB_DATA outdata,
2287 struct get_tunable_callback_data *cd =
2288 (struct get_tunable_callback_data *)callback;
2293 ("Timed out getting tunable \"%s\" from node %d\n",
2299 DEBUG(DEBUG_WARNING,
2300 ("Tunable \"%s\" not implemented on node %d\n",
2305 ("Unexpected error getting tunable \"%s\" from node %d\n",
/* Read one tunable from the connected nodes.
 *
 *   ctdb          - context used for the async control
 *   tmp_ctx       - talloc parent of the result and of scratch data
 *   nodemap       - sizes the result array and selects the nodes
 *   tunable       - name of the tunable to fetch
 *   default_value - initial value of every slot in the result
 *
 * A uint32_t array with nodemap->num entries is allocated on tmp_ctx and
 * pre-filled with default_value; NULL is returned if that allocation or
 * the allocation of the request buffer fails.  The request is a struct
 * ctdb_control_get_tunable carrying the NUL-terminated name, sent with
 * CTDB_CONTROL_GET_TUNABLE; replies are stored by get_tunable_callback().
 * The request buffer is freed before returning.
 *
 * The request buffer is checked after allocation because it is written
 * through t immediately afterwards.
 */
2311 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2312 TALLOC_CTX *tmp_ctx,
2313 struct ctdb_node_map_old *nodemap,
2314 const char *tunable,
2315 uint32_t default_value)
2318 struct ctdb_control_get_tunable *t;
2321 struct get_tunable_callback_data callback_data;
2324 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2325 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2326 for (i=0; i<nodemap->num; i++) {
2327 tvals[i] = default_value;
2330 callback_data.out = tvals;
2331 callback_data.tunable = tunable;
2332 callback_data.fatal = false;
2334 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2335 data.dptr = talloc_size(tmp_ctx, data.dsize);
CTDB_NO_MEMORY_NULL(ctdb, data.dptr);
2336 t = (struct ctdb_control_get_tunable *)data.dptr;
2337 t->length = strlen(tunable)+1;
2338 memcpy(t->name, tunable, t->length);
2339 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2340 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2341 nodes, 0, TAKEOVER_TIMEOUT(),
2343 get_tunable_callback,
2344 get_tunable_fail_callback,
2345 &callback_data) != 0) {
2346 if (callback_data.fatal) {
2352 talloc_free(data.dptr);
2357 /* Set internal flags for IP allocation:
2359 * Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2360 * Set NOIPHOST ip flag for each INACTIVE node
2361 * if all nodes are disabled:
2362 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2364 * Set NOIPHOST ip flags for disabled nodes
/* Parameters:
 *   ctdb                       - context for CTDB_NO_MEMORY_NULL
 *   tmp_ctx                    - talloc parent of the returned array
 *   nodemap                    - node flags; nodemap->num sizes the array
 *   tval_noiptakeover          - per-node values, indexed like nodemap
 *   tval_noiphostonalldisabled - per-node values, indexed like nodemap
 *
 * Allocates a zeroed array of struct ctdb_ipflags with one entry per
 * node.  noiptakeover is set where tval_noiptakeover[i] is non-zero and
 * noiphost where the node is INACTIVE.  noiphost is further set either
 * from tval_noiphostonalldisabled[i] (when all_nodes_are_disabled()) or
 * for DISABLED nodes.
 */
2366 static struct ctdb_ipflags *
2367 set_ipflags_internal(struct ctdb_context *ctdb,
2368 TALLOC_CTX *tmp_ctx,
2369 struct ctdb_node_map_old *nodemap,
2370 uint32_t *tval_noiptakeover,
2371 uint32_t *tval_noiphostonalldisabled)
2374 struct ctdb_ipflags *ipflags;
2376 /* Clear IP flags - implicit due to talloc_zero */
2377 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2378 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2380 for (i=0;i<nodemap->num;i++) {
2381 /* Can not take IPs on node with NoIPTakeover set */
2382 if (tval_noiptakeover[i] != 0) {
2383 ipflags[i].noiptakeover = true;
2386 /* Can not host IPs on INACTIVE node */
2387 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2388 ipflags[i].noiphost = true;
2392 if (all_nodes_are_disabled(nodemap)) {
2393 /* If all nodes are disabled, can not host IPs on node
2394 * with NoIPHostOnAllDisabled set
2396 for (i=0;i<nodemap->num;i++) {
2397 if (tval_noiphostonalldisabled[i] != 0) {
2398 ipflags[i].noiphost = true;
2402 /* If some nodes are not disabled, then can not host
2403 * IPs on DISABLED node
2405 for (i=0;i<nodemap->num;i++) {
2406 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2407 ipflags[i].noiphost = true;
/* Parameters:
 *   ctdb    - context used to query the nodes
 *   tmp_ctx - talloc parent of everything allocated here; the caller
 *             frees it
 *   nodemap - nodes to query and to derive flags for
 *
 * Fetches two per-node tunable arrays with get_tunable_from_nodes() (the
 * second being "NoIPHostOnAllDisabled", default 0), feeds them to
 * set_ipflags_internal(), and frees both arrays afterwards.  A NULL from
 * either fetch is tested for.
 */
2415 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2416 TALLOC_CTX *tmp_ctx,
2417 struct ctdb_node_map_old *nodemap)
2419 uint32_t *tval_noiptakeover;
2420 uint32_t *tval_noiphostonalldisabled;
2421 struct ctdb_ipflags *ipflags;
2424 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2426 if (tval_noiptakeover == NULL) {
2430 tval_noiphostonalldisabled =
2431 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2432 "NoIPHostOnAllDisabled", 0);
2433 if (tval_noiphostonalldisabled == NULL) {
2434 /* Caller frees tmp_ctx */
2438 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2440 tval_noiphostonalldisabled);
2442 talloc_free(tval_noiptakeover);
2443 talloc_free(tval_noiphostonalldisabled);
/* Allocate and initialise a struct ipalloc_state on mem_ctx.
 *
 * num is taken from ctdb->num_nodes, and known_public_ips and
 * available_public_ips are zeroed pointer arrays of that length, owned by
 * the state.  If either array cannot be allocated the state is freed.
 * The algorithm is IPALLOC_LCP2 when tunable lcp2_public_ip_assignment is
 * 1, otherwise IPALLOC_DETERMINISTIC when deterministic_public_ips is 1,
 * otherwise IPALLOC_NONDETERMINISTIC.  no_ip_failback is copied from the
 * tunable of the same name.
 */
2448 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2449 TALLOC_CTX *mem_ctx)
2451 struct ipalloc_state *ipalloc_state =
2452 talloc_zero(mem_ctx, struct ipalloc_state);
2453 if (ipalloc_state == NULL) {
2454 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2458 ipalloc_state->num = ctdb->num_nodes;
2459 ipalloc_state->known_public_ips =
2460 talloc_zero_array(ipalloc_state,
2461 struct ctdb_public_ip_list_old *,
2462 ipalloc_state->num);
2463 if (ipalloc_state->known_public_ips == NULL) {
2464 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2465 talloc_free(ipalloc_state);
2468 ipalloc_state->available_public_ips =
2469 talloc_zero_array(ipalloc_state,
2470 struct ctdb_public_ip_list_old *,
2471 ipalloc_state->num);
2472 if (ipalloc_state->available_public_ips == NULL) {
2473 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2474 talloc_free(ipalloc_state);
2478 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2479 ipalloc_state->algorithm = IPALLOC_LCP2;
2480 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2481 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2483 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2486 ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2488 return ipalloc_state;
/* State handed to iprealloc_fail_callback() by ctdb_takeover_run().
 * Besides the members below, the code in this file uses retry_nodes (a
 * bool array indexed by PNN) and retry_count.
 */
2491 struct iprealloc_callback_data {
2494 client_async_callback fail_callback;
2495 void *fail_callback_data;
2496 struct ctdb_node_map_old *nodemap;
/* Failure callback for the IPREALLOCATED control; callback points at a
 * struct iprealloc_callback_data.
 *
 * pnn indexes both cd->retry_nodes and cd->nodemap->nodes, so it must be
 * strictly below the length of retry_nodes; pnn == numnodes is already
 * out of range and is rejected along with larger values.
 *
 * A failure on an INACTIVE node is logged and ignored.  A timeout is
 * forwarded to cd->fail_callback when one is registered; any other
 * failure flags the node in cd->retry_nodes for a retry.
 */
2499 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2500 int32_t res, TDB_DATA outdata,
2504 struct iprealloc_callback_data *cd =
2505 (struct iprealloc_callback_data *)callback;
2507 numnodes = talloc_array_length(cd->retry_nodes);
2508 if (pnn >= numnodes) {
2510 ("ipreallocated failure from node %d, "
2511 "but only %d nodes in nodemap\n",
2516 /* Can't run the "ipreallocated" event on a INACTIVE node */
2517 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2518 DEBUG(DEBUG_WARNING,
2519 ("ipreallocated failed on inactive node %d, ignoring\n",
2526 /* If the control timed out then that's a real error,
2527 * so call the real fail callback
2529 if (cd->fail_callback) {
2530 cd->fail_callback(ctdb, pnn, res, outdata,
2531 cd->fail_callback_data);
2533 DEBUG(DEBUG_WARNING,
2534 ("iprealloc timed out but no callback registered\n"));
2538 /* If not a timeout then either the ipreallocated
2539 * eventscript (or some setup) failed. This might
2540 * have failed because the IPREALLOCATED control isn't
2541 * implemented - right now there is no way of knowing
2542 * because the error codes are all folded down to -1.
2543 * Consider retrying using EVENTSCRIPT control...
2545 DEBUG(DEBUG_WARNING,
2546 ("ipreallocated failure from node %d, flagging retry\n",
2548 cd->retry_nodes[pnn] = true;
/* State handed to takeover_run_fail_callback() by ctdb_takeover_run().
 * Besides the members below, the code in this file uses node_failed, a
 * bool array with one entry per nodemap index.
 */
2553 struct takeover_callback_data {
2555 client_async_callback fail_callback;
2556 void *fail_callback_data;
2557 struct ctdb_node_map_old *nodemap;
/* Failure callback for the RELEASE_IP controls sent by
 * ctdb_takeover_run(); callback_data must be a talloc'ed struct
 * takeover_callback_data (talloc_get_type_abort() enforces the type).
 *
 * node_pnn is looked up in cd->nodemap to find its index; an unknown PNN
 * is logged as an error.  The first failure seen for a node marks it in
 * cd->node_failed and is forwarded to cd->fail_callback, so the real
 * callback runs at most once per node.
 *
 * The forwarding is skipped when no callback is registered, matching the
 * NULL check in iprealloc_fail_callback().
 */
2560 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2561 uint32_t node_pnn, int32_t res,
2562 TDB_DATA outdata, void *callback_data)
2564 struct takeover_callback_data *cd =
2565 talloc_get_type_abort(callback_data,
2566 struct takeover_callback_data);
2569 for (i = 0; i < cd->nodemap->num; i++) {
2570 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2575 if (i == cd->nodemap->num) {
2576 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2580 if (!cd->node_failed[i]) {
2581 cd->node_failed[i] = true;
if (cd->fail_callback != NULL)
2582 cd->fail_callback(ctdb, node_pnn, res, outdata,
2583 cd->fail_callback_data);
2588 make any IP alias changes for public addresses that are necessary
/* Parameters:
 *   ctdb                  - daemon context; ctdb->ipalloc_state is set here
 *   nodemap               - node map used to select and address nodes
 *   force_rebalance_nodes - passed through to ctdb_takeover_run_core()
 *   fail_callback         - called for nodes on which a control fails
 *   callback_data         - opaque pointer given to fail_callback
 *
 * Phases, in order:
 *   1. create the allocation state and per-node IP flags, and fetch the
 *      known/available public IPs from the nodes;
 *   2. compute the new assignment with ctdb_takeover_run_core();
 *   3. send CTDB_CONTROL_RELEASE_IP for each IP to every connected node
 *      that should not host it, then wait for the replies;
 *   4. send CTDB_CONTROL_TAKEOVER_IP for each assigned IP to its node,
 *      then wait for the replies;
 *   5. send CTDB_CONTROL_IPREALLOCATED to the connected nodes, retrying
 *      the nodes flagged by iprealloc_fail_callback() with
 *      CTDB_CONTROL_RUN_EVENTSCRIPTS.
 * ctdb->tunable.disable_ip_failover != 0 and the case where no node has
 * available public IPs are both tested for before the allocation phases.
 *
 * All temporary allocations hang off tmp_ctx, which is freed on the
 * error paths and at the end.
 *
 * NOTE(review): ipalloc_state is allocated on tmp_ctx and stored in
 * ctdb->ipalloc_state, so that pointer looks stale once tmp_ctx has been
 * freed - confirm nothing reads it after this function returns.
 */
2590 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2591 uint32_t *force_rebalance_nodes,
2592 client_async_callback fail_callback, void *callback_data)
2595 struct ctdb_public_ip ip;
2597 struct public_ip_list *all_ips, *tmp_ip;
2599 struct timeval timeout;
2600 struct client_async_data *async_data;
2601 struct ctdb_client_control_state *state;
2602 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2603 struct ctdb_ipflags *ipflags;
2604 struct ipalloc_state *ipalloc_state;
2605 struct takeover_callback_data *takeover_data;
2606 struct iprealloc_callback_data iprealloc_data;
2611 * ip failover is completely disabled, just send out the
2612 * ipreallocated event.
2614 if (ctdb->tunable.disable_ip_failover != 0) {
2618 ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2619 if (ipalloc_state == NULL) {
2620 talloc_free(tmp_ctx);
2623 ctdb->ipalloc_state = ipalloc_state;
2625 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2626 if (ipflags == NULL) {
2627 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2628 talloc_free(tmp_ctx);
2632 /* Fetch known/available public IPs from each active node */
2633 ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2635 talloc_free(tmp_ctx);
2639 /* Short-circuit IP allocation if no node has available IPs */
2640 can_host_ips = false;
2641 for (i=0; i < ipalloc_state->num; i++) {
2642 if (ipalloc_state->available_public_ips[i] != NULL) {
2643 can_host_ips = true;
2646 if (!can_host_ips) {
2647 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2651 /* Do the IP reassignment calculations */
2652 ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2654 /* Now tell all nodes to release any public IPs should not
2655 * host. This will be a NOOP on nodes that don't currently
2656 * hold the given IP.
2658 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2659 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2661 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2662 bool, nodemap->num);
2663 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2664 takeover_data->fail_callback = fail_callback;
2665 takeover_data->fail_callback_data = callback_data;
2666 takeover_data->nodemap = nodemap;
2668 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2669 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2671 async_data->fail_callback = takeover_run_fail_callback;
2672 async_data->callback_data = takeover_data;
2674 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2676 /* Send a RELEASE_IP to all nodes that should not be hosting
2677 * each IP. For each IP, all but one of these will be
2678 * redundant. However, the redundant ones are used to tell
2679 * nodes which node should be hosting the IP so that commands
2680 * like "ctdb ip" can display a particular nodes idea of who
2681 * is hosting what. */
2682 for (i=0;i<nodemap->num;i++) {
2683 /* don't talk to unconnected nodes, but do talk to banned nodes */
2684 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2688 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2689 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2690 /* This node should be serving this
2691 vnn so don't tell it to release the ip
2695 ip.pnn = tmp_ip->pnn;
2696 ip.addr = tmp_ip->addr;
2698 timeout = TAKEOVER_TIMEOUT();
2699 data.dsize = sizeof(ip);
2700 data.dptr = (uint8_t *)&ip;
2701 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2702 0, CTDB_CONTROL_RELEASE_IP, 0,
2705 if (state == NULL) {
2706 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2707 talloc_free(tmp_ctx);
2711 ctdb_client_async_add(async_data, state);
2714 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2715 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2716 talloc_free(tmp_ctx);
2719 talloc_free(async_data);
2722 /* For each IP, send a TAKOVER_IP to the node that should be
2723 * hosting it. Many of these will often be redundant (since
2724 * the allocation won't have changed) but they can be useful
2725 * to recover from inconsistencies. */
2726 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2727 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2729 async_data->fail_callback = fail_callback;
2730 async_data->callback_data = callback_data;
2732 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2733 if (tmp_ip->pnn == -1) {
2734 /* this IP won't be taken over */
2738 ip.pnn = tmp_ip->pnn;
2739 ip.addr = tmp_ip->addr;
2741 timeout = TAKEOVER_TIMEOUT();
2742 data.dsize = sizeof(ip);
2743 data.dptr = (uint8_t *)&ip;
2744 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2745 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2746 data, async_data, &timeout, NULL);
2747 if (state == NULL) {
2748 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2749 talloc_free(tmp_ctx);
2753 ctdb_client_async_add(async_data, state);
2755 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2756 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2757 talloc_free(tmp_ctx);
2763 * Tell all nodes to run eventscripts to process the
2764 * "ipreallocated" event. This can do a lot of things,
2765 * including restarting services to reconfigure them if public
2766 * IPs have moved. Once upon a time this event only used to
2769 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2770 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2771 iprealloc_data.retry_nodes = retry_data;
2772 iprealloc_data.retry_count = 0;
2773 iprealloc_data.fail_callback = fail_callback;
2774 iprealloc_data.fail_callback_data = callback_data;
2775 iprealloc_data.nodemap = nodemap;
2777 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2778 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2779 nodes, 0, TAKEOVER_TIMEOUT(),
2781 NULL, iprealloc_fail_callback,
2784 /* If the control failed then we should retry to any
2785 * nodes flagged by iprealloc_fail_callback using the
2786 * EVENTSCRIPT control. This is a best-effort at
2787 * backward compatiblity when running a mixed cluster
2788 * where some nodes have not yet been upgraded to
2789 * support the IPREALLOCATED control.
2791 DEBUG(DEBUG_WARNING,
2792 ("Retry ipreallocated to some nodes using eventscript control\n"));
2794 nodes = talloc_array(tmp_ctx, uint32_t,
2795 iprealloc_data.retry_count);
2796 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2799 for (i=0; i<nodemap->num; i++) {
2800 if (iprealloc_data.retry_nodes[i]) {
2806 data.dptr = discard_const("ipreallocated");
2807 data.dsize = strlen((char *)data.dptr) + 1;
2808 ret = ctdb_client_async_control(ctdb,
2809 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2810 nodes, 0, TAKEOVER_TIMEOUT(),
2812 NULL, fail_callback,
2815 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2819 talloc_free(tmp_ctx);
2825 destroy a ctdb_client_ip structure
2827 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2829 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2830 ctdb_addr_to_str(&ip->addr),
2831 ntohs(ip->addr.ip.sin_port),
2834 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2839 called by a client to inform us of a TCP connection that it is managing
2840 that should tickled with an ACK when IP takeover is done
2842 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2845 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2846 struct ctdb_connection *tcp_sock = NULL;
2847 struct ctdb_tcp_list *tcp;
2848 struct ctdb_connection t;
2851 struct ctdb_client_ip *ip;
2852 struct ctdb_vnn *vnn;
2853 ctdb_sock_addr addr;
2855 /* If we don't have public IPs, tickles are useless */
2856 if (ctdb->vnn == NULL) {
2860 tcp_sock = (struct ctdb_connection *)indata.dptr;
2862 addr = tcp_sock->src;
2863 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2864 addr = tcp_sock->dst;
2865 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2868 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2869 vnn = find_public_ip_vnn(ctdb, &addr);
2871 switch (addr.sa.sa_family) {
2873 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2874 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2875 ctdb_addr_to_str(&addr)));
2879 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2880 ctdb_addr_to_str(&addr)));
2883 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2889 if (vnn->pnn != ctdb->pnn) {
2890 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2891 ctdb_addr_to_str(&addr),
2892 client_id, client->pid));
2893 /* failing this call will tell smbd to die */
2897 ip = talloc(client, struct ctdb_client_ip);
2898 CTDB_NO_MEMORY(ctdb, ip);
2902 ip->client_id = client_id;
2903 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2904 DLIST_ADD(ctdb->client_ip_list, ip);
2906 tcp = talloc(client, struct ctdb_tcp_list);
2907 CTDB_NO_MEMORY(ctdb, tcp);
2909 tcp->connection.src = tcp_sock->src;
2910 tcp->connection.dst = tcp_sock->dst;
2912 DLIST_ADD(client->tcp_list, tcp);
2914 t.src = tcp_sock->src;
2915 t.dst = tcp_sock->dst;
2917 data.dptr = (uint8_t *)&t;
2918 data.dsize = sizeof(t);
2920 switch (addr.sa.sa_family) {
2922 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2923 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2924 ctdb_addr_to_str(&tcp_sock->src),
2925 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2928 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2929 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2930 ctdb_addr_to_str(&tcp_sock->src),
2931 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2934 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2938 /* tell all nodes about this tcp connection */
2939 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2940 CTDB_CONTROL_TCP_ADD,
2941 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2943 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2951 find a tcp address on a list
2953 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2954 struct ctdb_connection *tcp)
2958 if (array == NULL) {
2962 for (i=0;i<array->num;i++) {
2963 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2964 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2965 return &array->connections[i];
2974 called by a daemon to inform us of a TCP connection that one of its
2975 clients managing that should tickled with an ACK when IP takeover is
2978 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2980 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2981 struct ctdb_tcp_array *tcparray;
2982 struct ctdb_connection tcp;
2983 struct ctdb_vnn *vnn;
2985 /* If we don't have public IPs, tickles are useless */
2986 if (ctdb->vnn == NULL) {
2990 vnn = find_public_ip_vnn(ctdb, &p->dst);
2992 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2993 ctdb_addr_to_str(&p->dst)));
2999 tcparray = vnn->tcp_array;
3001 /* If this is the first tickle */
3002 if (tcparray == NULL) {
3003 tcparray = talloc(vnn, struct ctdb_tcp_array);
3004 CTDB_NO_MEMORY(ctdb, tcparray);
3005 vnn->tcp_array = tcparray;
3008 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3009 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3011 tcparray->connections[tcparray->num].src = p->src;
3012 tcparray->connections[tcparray->num].dst = p->dst;
3015 if (tcp_update_needed) {
3016 vnn->tcp_update_needed = true;
3022 /* Do we already have this tickle ?*/
3025 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3026 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3027 ctdb_addr_to_str(&tcp.dst),
3028 ntohs(tcp.dst.ip.sin_port),
3033 /* A new tickle, we must add it to the array */
3034 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3035 struct ctdb_connection,
3037 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3039 tcparray->connections[tcparray->num].src = p->src;
3040 tcparray->connections[tcparray->num].dst = p->dst;
3043 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3044 ctdb_addr_to_str(&tcp.dst),
3045 ntohs(tcp.dst.ip.sin_port),
3048 if (tcp_update_needed) {
3049 vnn->tcp_update_needed = true;
3057 called by a daemon to inform us of a TCP connection that one of its
3058 clients managing that should tickled with an ACK when IP takeover is
3061 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3063 struct ctdb_connection *tcpp;
3064 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3067 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3068 ctdb_addr_to_str(&conn->dst)));
3072 /* if the array is empty we cant remove it
3073 and we don't need to do anything
3075 if (vnn->tcp_array == NULL) {
3076 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3077 ctdb_addr_to_str(&conn->dst),
3078 ntohs(conn->dst.ip.sin_port)));
3083 /* See if we know this connection
3084 if we don't know this connection then we dont need to do anything
3086 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3088 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3089 ctdb_addr_to_str(&conn->dst),
3090 ntohs(conn->dst.ip.sin_port)));
3095 /* We need to remove this entry from the array.
3096 Instead of allocating a new array and copying data to it
3097 we cheat and just copy the last entry in the existing array
3098 to the entry that is to be removed and just shring the
3101 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3102 vnn->tcp_array->num--;
3104 /* If we deleted the last entry we also need to remove the entire array
3106 if (vnn->tcp_array->num == 0) {
3107 talloc_free(vnn->tcp_array);
3108 vnn->tcp_array = NULL;
3111 vnn->tcp_update_needed = true;
3113 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3114 ctdb_addr_to_str(&conn->src),
3115 ntohs(conn->src.ip.sin_port)));
3120 called by a daemon to inform us of a TCP connection that one of its
3121 clients used are no longer needed in the tickle database
3123 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3125 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3127 /* If we don't have public IPs, tickles are useless */
3128 if (ctdb->vnn == NULL) {
3132 ctdb_remove_connection(ctdb, conn);
3139 Called when another daemon starts - causes all tickles for all
3140 public addresses we are serving to be sent to the new node on the
3141 next check. This actually causes the next scheduled call to
3142 tdb_update_tcp_tickles() to update all nodes. This is simple and
3143 doesn't require careful error handling.
3145 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3147 struct ctdb_vnn *vnn;
3149 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3150 (unsigned long) pnn));
3152 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3153 vnn->tcp_update_needed = true;
3161 called when a client structure goes away - hook to remove
3162 elements from the tcp_list in all daemons
3164 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3166 while (client->tcp_list) {
3167 struct ctdb_tcp_list *tcp = client->tcp_list;
3168 DLIST_REMOVE(client->tcp_list, tcp);
3169 ctdb_remove_connection(client->ctdb, &tcp->connection);
3174 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3176 struct ctdb_vnn *vnn;
3179 if (ctdb->tunable.disable_ip_failover == 1) {
3183 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3184 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3185 ctdb_vnn_unassign_iface(ctdb, vnn);
3192 /* Don't allow multiple releases at once. Some code,
3193 * particularly ctdb_tickle_sentenced_connections() is
3195 if (vnn->update_in_flight) {
3196 DEBUG(DEBUG_WARNING,
3198 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3199 ctdb_addr_to_str(&vnn->public_address),
3200 vnn->public_netmask_bits,
3201 ctdb_vnn_iface_string(vnn)));
3204 vnn->update_in_flight = true;
3206 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3207 ctdb_addr_to_str(&vnn->public_address),
3208 vnn->public_netmask_bits,
3209 ctdb_vnn_iface_string(vnn)));
3211 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3212 ctdb_vnn_iface_string(vnn),
3213 ctdb_addr_to_str(&vnn->public_address),
3214 vnn->public_netmask_bits);
3215 release_kill_clients(ctdb, &vnn->public_address);
3216 ctdb_vnn_unassign_iface(ctdb, vnn);
3217 vnn->update_in_flight = false;
3221 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3226 get list of public IPs
3228 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3229 struct ctdb_req_control_old *c, TDB_DATA *outdata)
3232 struct ctdb_public_ip_list_old *ips;
3233 struct ctdb_vnn *vnn;
3234 bool only_available = false;
3236 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3237 only_available = true;
3240 /* count how many public ip structures we have */
3242 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3246 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3247 num*sizeof(struct ctdb_public_ip);
3248 ips = talloc_zero_size(outdata, len);
3249 CTDB_NO_MEMORY(ctdb, ips);
3252 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3253 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3256 ips->ips[i].pnn = vnn->pnn;
3257 ips->ips[i].addr = vnn->public_address;
3261 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3262 i*sizeof(struct ctdb_public_ip);
3264 outdata->dsize = len;
3265 outdata->dptr = (uint8_t *)ips;
3271 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3272 struct ctdb_req_control_old *c,
3277 ctdb_sock_addr *addr;
3278 struct ctdb_public_ip_info_old *info;
3279 struct ctdb_vnn *vnn;
3281 addr = (ctdb_sock_addr *)indata.dptr;
3283 vnn = find_public_ip_vnn(ctdb, addr);
3285 /* if it is not a public ip it could be our 'single ip' */
3286 if (ctdb->single_ip_vnn) {
3287 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3288 vnn = ctdb->single_ip_vnn;
3293 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3294 "'%s'not a public address\n",
3295 ctdb_addr_to_str(addr)));
3299 /* count how many public ip structures we have */
3301 for (;vnn->ifaces[num];) {
3305 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3306 num*sizeof(struct ctdb_iface);
3307 info = talloc_zero_size(outdata, len);
3308 CTDB_NO_MEMORY(ctdb, info);
3310 info->ip.addr = vnn->public_address;
3311 info->ip.pnn = vnn->pnn;
3312 info->active_idx = 0xFFFFFFFF;
3314 for (i=0; vnn->ifaces[i]; i++) {
3315 struct ctdb_interface *cur;
3317 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3319 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3323 if (vnn->iface == cur) {
3324 info->active_idx = i;
3326 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3327 info->ifaces[i].link_state = cur->link_up;
3328 info->ifaces[i].references = cur->references;
3331 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3332 i*sizeof(struct ctdb_iface);
3334 outdata->dsize = len;
3335 outdata->dptr = (uint8_t *)info;
3340 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3341 struct ctdb_req_control_old *c,
3345 struct ctdb_iface_list_old *ifaces;
3346 struct ctdb_interface *cur;
3348 /* count how many public ip structures we have */
3350 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3354 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3355 num*sizeof(struct ctdb_iface);
3356 ifaces = talloc_zero_size(outdata, len);
3357 CTDB_NO_MEMORY(ctdb, ifaces);
3360 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3361 strcpy(ifaces->ifaces[i].name, cur->name);
3362 ifaces->ifaces[i].link_state = cur->link_up;
3363 ifaces->ifaces[i].references = cur->references;
3367 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3368 i*sizeof(struct ctdb_iface);
3370 outdata->dsize = len;
3371 outdata->dptr = (uint8_t *)ifaces;
3376 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3377 struct ctdb_req_control_old *c,
3380 struct ctdb_iface *info;
3381 struct ctdb_interface *iface;
3382 bool link_up = false;
3384 info = (struct ctdb_iface *)indata.dptr;
3386 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3387 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3388 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3389 len, len, info->name));
3393 switch (info->link_state) {
3401 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3402 (unsigned int)info->link_state));
3406 if (info->references != 0) {
3407 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3408 (unsigned int)info->references));
3412 iface = ctdb_find_iface(ctdb, info->name);
3413 if (iface == NULL) {
3417 if (link_up == iface->link_up) {
3421 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3422 ("iface[%s] has changed it's link status %s => %s\n",
3424 iface->link_up?"up":"down",
3425 link_up?"up":"down"));
3427 iface->link_up = link_up;
3433 structure containing the listening socket and the list of tcp connections
3434 that the ctdb daemon is to kill
3436 struct ctdb_kill_tcp {
3437 struct ctdb_vnn *vnn;
3438 struct ctdb_context *ctdb;
3440 struct tevent_fd *fde;
3441 trbt_tree_t *connections;
3446 a tcp connection that is to be killed
3448 struct ctdb_killtcp_con {
3449 ctdb_sock_addr src_addr;
3450 ctdb_sock_addr dst_addr;
3452 struct ctdb_kill_tcp *killtcp;
3455 /* this function is used to create a key to represent this socketpair
3456 in the killtcp tree.
3457 this key is used to insert and lookup matching socketpairs that are
3458 to be tickled and RST
3460 #define KILLTCP_KEYLEN 10
3461 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3463 static uint32_t key[KILLTCP_KEYLEN];
3465 bzero(key, sizeof(key));
3467 if (src->sa.sa_family != dst->sa.sa_family) {
3468 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3472 switch (src->sa.sa_family) {
3474 key[0] = dst->ip.sin_addr.s_addr;
3475 key[1] = src->ip.sin_addr.s_addr;
3476 key[2] = dst->ip.sin_port;
3477 key[3] = src->ip.sin_port;
3480 uint32_t *dst6_addr32 =
3481 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3482 uint32_t *src6_addr32 =
3483 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3484 key[0] = dst6_addr32[3];
3485 key[1] = src6_addr32[3];
3486 key[2] = dst6_addr32[2];
3487 key[3] = src6_addr32[2];
3488 key[4] = dst6_addr32[1];
3489 key[5] = src6_addr32[1];
3490 key[6] = dst6_addr32[0];
3491 key[7] = src6_addr32[0];
3492 key[8] = dst->ip6.sin6_port;
3493 key[9] = src->ip6.sin6_port;
3497 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3505 called when we get a read event on the raw socket
3507 static void capture_tcp_handler(struct tevent_context *ev,
3508 struct tevent_fd *fde,
3509 uint16_t flags, void *private_data)
3511 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3512 struct ctdb_killtcp_con *con;
3513 ctdb_sock_addr src, dst;
3514 uint32_t ack_seq, seq;
3516 if (!(flags & TEVENT_FD_READ)) {
3520 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3521 killtcp->private_data,
3523 &ack_seq, &seq) != 0) {
3524 /* probably a non-tcp ACK packet */
3528 /* check if we have this guy in our list of connections
3531 con = trbt_lookuparray32(killtcp->connections,
3532 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3534 /* no this was some other packet we can just ignore */
3538 /* This one has been tickled !
3539 now reset him and remove him from the list.
3541 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3542 ntohs(con->dst_addr.ip.sin_port),
3543 ctdb_addr_to_str(&con->src_addr),
3544 ntohs(con->src_addr.ip.sin_port)));
3546 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3551 /* when traversing the list of all tcp connections to send tickle acks to
3552 (so that we can capture the ack coming back and kill the connection
3554 this callback is called for each connection we are currently trying to kill
3556 static int tickle_connection_traverse(void *param, void *data)
3558 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3560 /* have tried too many times, just give up */
3561 if (con->count >= 5) {
3562 /* can't delete in traverse: reparent to delete_cons */
3563 talloc_steal(param, con);
3567 /* othervise, try tickling it again */
3570 (ctdb_sock_addr *)&con->dst_addr,
3571 (ctdb_sock_addr *)&con->src_addr,
3578 called every second until all sentenced connections have been reset
3580 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3581 struct tevent_timer *te,
3582 struct timeval t, void *private_data)
3584 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3585 void *delete_cons = talloc_new(NULL);
3587 /* loop over all connections sending tickle ACKs */
3588 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3590 /* now we've finished traverse, it's safe to do deletion. */
3591 talloc_free(delete_cons);
3593 /* If there are no more connections to kill we can remove the
3594 entire killtcp structure
3596 if ( (killtcp->connections == NULL) ||
3597 (killtcp->connections->root == NULL) ) {
3598 talloc_free(killtcp);
3602 /* try tickling them again in a seconds time
3604 tevent_add_timer(killtcp->ctdb->ev, killtcp,
3605 timeval_current_ofs(1, 0),
3606 ctdb_tickle_sentenced_connections, killtcp);
3610 destroy the killtcp structure
3612 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3614 struct ctdb_vnn *tmpvnn;
3616 /* verify that this vnn is still active */
3617 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3618 if (tmpvnn == killtcp->vnn) {
3623 if (tmpvnn == NULL) {
3627 if (killtcp->vnn->killtcp != killtcp) {
3631 killtcp->vnn->killtcp = NULL;
/* insert callback for the killtcp tree: nothing fancy, the new
   connection structure (parm) unconditionally replaces whatever was
   stored before (data).

   The old one is not freed here; per the original note it is
   talloc_stolen by the same node in the tree anyway and will be
   deleted when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	return parm;
}
3650 add a tcp socket to the list of connections we want to RST
3652 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3656 ctdb_sock_addr src, dst;
3657 struct ctdb_kill_tcp *killtcp;
3658 struct ctdb_killtcp_con *con;
3659 struct ctdb_vnn *vnn;
3661 ctdb_canonicalize_ip(s, &src);
3662 ctdb_canonicalize_ip(d, &dst);
3664 vnn = find_public_ip_vnn(ctdb, &dst);
3666 vnn = find_public_ip_vnn(ctdb, &src);
3669 /* if it is not a public ip it could be our 'single ip' */
3670 if (ctdb->single_ip_vnn) {
3671 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3672 vnn = ctdb->single_ip_vnn;
3677 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3681 killtcp = vnn->killtcp;
3683 /* If this is the first connection to kill we must allocate
3686 if (killtcp == NULL) {
3687 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3688 CTDB_NO_MEMORY(ctdb, killtcp);
3691 killtcp->ctdb = ctdb;
3692 killtcp->capture_fd = -1;
3693 killtcp->connections = trbt_create(killtcp, 0);
3695 vnn->killtcp = killtcp;
3696 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3701 /* create a structure that describes this connection we want to
3702 RST and store it in killtcp->connections
3704 con = talloc(killtcp, struct ctdb_killtcp_con);
3705 CTDB_NO_MEMORY(ctdb, con);
3706 con->src_addr = src;
3707 con->dst_addr = dst;
3709 con->killtcp = killtcp;
3712 trbt_insertarray32_callback(killtcp->connections,
3713 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3714 add_killtcp_callback, con);
3717 If we don't have a socket to listen on yet we must create it
3719 if (killtcp->capture_fd == -1) {
3720 const char *iface = ctdb_vnn_iface_string(vnn);
3721 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3722 if (killtcp->capture_fd == -1) {
3723 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3724 "socket on iface '%s' for killtcp (%s)\n",
3725 iface, strerror(errno)));
3731 if (killtcp->fde == NULL) {
3732 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3733 killtcp->capture_fd,
3735 capture_tcp_handler, killtcp);
3736 tevent_fd_set_auto_close(killtcp->fde);
3738 /* We also need to set up some events to tickle all these connections
3739 until they are all reset
3741 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3742 ctdb_tickle_sentenced_connections, killtcp);
3745 /* tickle him once now */
3754 talloc_free(vnn->killtcp);
3755 vnn->killtcp = NULL;
3760 kill a TCP connection.
3762 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3764 struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3766 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3770 called by a daemon to inform us of the entire list of TCP tickles for
3771 a particular public address.
3772 this control should only be sent by the node that is currently serving
3773 that public address.
3775 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3777 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3778 struct ctdb_tcp_array *tcparray;
3779 struct ctdb_vnn *vnn;
3781 /* We must at least have tickles.num or else we cant verify the size
3782 of the received data blob
3784 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3785 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3789 /* verify that the size of data matches what we expect */
3790 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3791 + sizeof(struct ctdb_connection) * list->num) {
3792 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3796 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3797 ctdb_addr_to_str(&list->addr)));
3799 vnn = find_public_ip_vnn(ctdb, &list->addr);
3801 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3802 ctdb_addr_to_str(&list->addr)));
3807 /* remove any old ticklelist we might have */
3808 talloc_free(vnn->tcp_array);
3809 vnn->tcp_array = NULL;
3811 tcparray = talloc(vnn, struct ctdb_tcp_array);
3812 CTDB_NO_MEMORY(ctdb, tcparray);
3814 tcparray->num = list->num;
3816 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3817 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3819 memcpy(tcparray->connections, &list->connections[0],
3820 sizeof(struct ctdb_connection)*tcparray->num);
3822 /* We now have a new fresh tickle list array for this vnn */
3823 vnn->tcp_array = tcparray;
3829 called to return the full list of tickles for the puclic address associated
3830 with the provided vnn
3832 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3834 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3835 struct ctdb_tickle_list_old *list;
3836 struct ctdb_tcp_array *tcparray;
3838 struct ctdb_vnn *vnn;
3840 vnn = find_public_ip_vnn(ctdb, addr);
3842 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3843 ctdb_addr_to_str(addr)));
3848 tcparray = vnn->tcp_array;
3850 num = tcparray->num;
3855 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3856 + sizeof(struct ctdb_connection) * num;
3858 outdata->dptr = talloc_size(outdata, outdata->dsize);
3859 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3860 list = (struct ctdb_tickle_list_old *)outdata->dptr;
3865 memcpy(&list->connections[0], tcparray->connections,
3866 sizeof(struct ctdb_connection) * num);
3874 set the list of all tcp tickles for a public address
3876 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3877 ctdb_sock_addr *addr,
3878 struct ctdb_tcp_array *tcparray)
3882 struct ctdb_tickle_list_old *list;
3885 num = tcparray->num;
3890 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3891 sizeof(struct ctdb_connection) * num;
3892 data.dptr = talloc_size(ctdb, data.dsize);
3893 CTDB_NO_MEMORY(ctdb, data.dptr);
3895 list = (struct ctdb_tickle_list_old *)data.dptr;
3899 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3902 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3903 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3904 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3906 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3910 talloc_free(data.dptr);
3917 perform tickle updates if required
3919 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3920 struct tevent_timer *te,
3921 struct timeval t, void *private_data)
3923 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3925 struct ctdb_vnn *vnn;
3927 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3928 /* we only send out updates for public addresses that
3931 if (ctdb->pnn != vnn->pnn) {
3934 /* We only send out the updates if we need to */
3935 if (!vnn->tcp_update_needed) {
3938 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3939 &vnn->public_address,
3942 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3943 ctdb_addr_to_str(&vnn->public_address)));
3946 ("Sent tickle update for public address %s\n",
3947 ctdb_addr_to_str(&vnn->public_address)));
3948 vnn->tcp_update_needed = false;
3952 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3953 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3954 ctdb_update_tcp_tickles, ctdb);
3958 start periodic update of tcp tickles
3960 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3962 ctdb->tickle_update_context = talloc_new(ctdb);
3964 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3965 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3966 ctdb_update_tcp_tickles, ctdb);
3972 struct control_gratious_arp {
3973 struct ctdb_context *ctdb;
3974 ctdb_sock_addr addr;
3980 send a control_gratuitous arp
3982 static void send_gratious_arp(struct tevent_context *ev,
3983 struct tevent_timer *te,
3984 struct timeval t, void *private_data)
3987 struct control_gratious_arp *arp = talloc_get_type(private_data,
3988 struct control_gratious_arp);
3990 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3992 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3993 arp->iface, strerror(errno)));
3998 if (arp->count == CTDB_ARP_REPEAT) {
4003 tevent_add_timer(arp->ctdb->ev, arp,
4004 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4005 send_gratious_arp, arp);
4012 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4014 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4015 struct control_gratious_arp *arp;
4017 /* verify the size of indata */
4018 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4019 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
4020 (unsigned)indata.dsize,
4021 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4025 ( offsetof(struct ctdb_addr_info_old, iface)
4026 + gratious_arp->len ) ){
4028 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4029 "but should be %u bytes\n",
4030 (unsigned)indata.dsize,
4031 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4036 arp = talloc(ctdb, struct control_gratious_arp);
4037 CTDB_NO_MEMORY(ctdb, arp);
4040 arp->addr = gratious_arp->addr;
4041 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4042 CTDB_NO_MEMORY(ctdb, arp->iface);
4045 tevent_add_timer(arp->ctdb->ev, arp,
4046 timeval_zero(), send_gratious_arp, arp);
4051 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4053 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4056 /* verify the size of indata */
4057 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4058 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4062 ( offsetof(struct ctdb_addr_info_old, iface)
4065 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4066 "but should be %u bytes\n",
4067 (unsigned)indata.dsize,
4068 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4072 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4074 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4077 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
/*
  state kept while a del_public_address request waits for the
  release-ip control to finish
 */
struct delete_ip_callback_state {
	/* the request that is answered from delete_ip_callback() */
	struct ctdb_req_control_old *c;
};
4089 called when releaseip event finishes for del_public_address
4091 static void delete_ip_callback(struct ctdb_context *ctdb,
4092 int32_t status, TDB_DATA data,
4093 const char *errormsg,
4096 struct delete_ip_callback_state *state =
4097 talloc_get_type(private_data, struct delete_ip_callback_state);
4099 /* If release failed then fail. */
4100 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4101 talloc_free(private_data);
/*
  Control handler: delete a public address.

  indata is interpreted as a struct ctdb_addr_info_old and size-checked
  in the same way as for the add control.  The list at ctdb->vnn is
  searched for an entry whose public_address matches pub->addr:

  - if the entry is assigned to this node (vnn->pnn == ctdb->pnn) it is
    marked delete_pending, a CTDB_CONTROL_RELEASE_IP control is sent
    and the reply to c is deferred (*async_reply is set to true);
  - otherwise the entry is removed with do_delete_ip().

  If no entry matches, an error is logged.
 */
4104 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4105 struct ctdb_req_control_old *c,
4106 TDB_DATA indata, bool *async_reply)
4108 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4109 struct ctdb_vnn *vnn;
4111 /* verify the size of indata */
4112 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4113 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4117 ( offsetof(struct ctdb_addr_info_old, iface)
4120 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4121 "but should be %u bytes\n",
4122 (unsigned)indata.dsize,
4123 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4127 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4129 /* walk over all public addresses until we find a match */
4130 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4131 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
/* the address is currently assigned to this node */
4132 if (vnn->pnn == ctdb->pnn) {
4133 struct delete_ip_callback_state *state;
4134 struct ctdb_public_ip *ip;
4138 vnn->delete_pending = true;
4140 state = talloc(ctdb,
4141 struct delete_ip_callback_state);
4142 CTDB_NO_MEMORY(ctdb, state);
/* ip is a talloc child of state; it carries the address and is
 * used as the payload of the RELEASE_IP control below */
4145 ip = talloc(state, struct ctdb_public_ip);
4148 (__location__ " Out of memory\n"));
4153 ip->addr = pub->addr;
4155 data.dsize = sizeof(struct ctdb_public_ip);
4156 data.dptr = (unsigned char *)ip;
4158 ret = ctdb_daemon_send_control(ctdb,
4161 CTDB_CONTROL_RELEASE_IP,
4168 (__location__ "Unable to send "
4169 "CTDB_CONTROL_RELEASE_IP\n"));
/* take ownership of the request and defer the reply.
 * NOTE(review): presumably answered from delete_ip_callback() once
 * the release completes - confirm */
4174 state->c = talloc_steal(state, c);
4175 *async_reply = true;
4177 /* This IP is not hosted on the
4178 * current node so just delete it
4180 do_delete_ip(ctdb, vnn);
4187 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4188 ctdb_addr_to_str(&pub->addr)));
/*
  State handed to ctdb_ipreallocated_callback() as its private pointer.
 */
4193 struct ipreallocated_callback_state {
/* control request that is answered once the event has finished */
4194 struct ctdb_req_control_old *c;
/*
  Completion callback for the "ipreallocated" event script.

  p is a struct ipreallocated_callback_state.  status is the result of
  the event; it is passed on unchanged as the status of the reply to
  the saved control request.
 */
4197 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4198 int status, void *p)
4200 struct ipreallocated_callback_state *state =
4201 talloc_get_type(p, struct ipreallocated_callback_state);
4205 (" \"ipreallocated\" event script failed (status %d)\n",
/* a status of -ETIME makes this node ban itself */
4207 if (status == -ETIME) {
4208 ctdb_ban_self(ctdb);
4212 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4216 /* A control to run the ipreallocated event */
/*
  Allocates a callback state on ctdb, starts the CTDB_EVENT_IPREALLOCATED
  event script with ctdb_ipreallocated_callback() as its completion
  callback, then takes ownership of the request c and sets *async_reply
  so that the reply is sent from the callback.
 */
4217 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4218 struct ctdb_req_control_old *c,
4222 struct ipreallocated_callback_state *state;
4224 state = talloc(ctdb, struct ipreallocated_callback_state);
4225 CTDB_NO_MEMORY(ctdb, state);
4227 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
/* state is passed twice: as the second argument and as the callback's
 * private pointer.  NOTE(review): the second argument looks like the
 * talloc parent for the event - confirm against
 * ctdb_event_script_callback() */
4229 ret = ctdb_event_script_callback(ctdb, state,
4230 ctdb_ipreallocated_callback, state,
4231 CTDB_EVENT_IPREALLOCATED,
4235 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4240 /* tell the control that we will reply asynchronously */
4241 state->c = talloc_steal(state, c);
4242 *async_reply = true;
4248 /* This function is called from the recovery daemon to verify that a remote
4249 node has the expected ip allocation.
4250 This is verified against ctdb->ip_tree
/*
  ips is the list of public addresses reported by a node; pnn is used
  in the log messages as the number of that node.

  Each entry of ips is looked up in ctdb->ip_tree by the key of its
  address.  An address with no record in the tree is logged as new or
  unknown.  An entry whose pnn differs from the pnn stored in the tree
  is logged as an inconsistent allocation.  Entries where either pnn is
  -1 are tested for before that comparison.
  NOTE(review): confirm how the -1 case and the return values are
  handled.
 */
4252 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4253 struct ctdb_public_ip_list_old *ips,
4256 struct public_ip_list *tmp_ip;
4259 if (ctdb->ip_tree == NULL) {
4260 /* don't know the expected allocation yet, assume remote node
4269 for (i=0; i<ips->num; i++) {
4270 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4271 if (tmp_ip == NULL) {
4272 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4276 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4280 if (tmp_ip->pnn != ips->ips[i].pnn) {
4282 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4284 ctdb_addr_to_str(&ips->ips[i].addr),
4285 ips->ips[i].pnn, tmp_ip->pnn));
/*
  Record in ctdb->ip_tree that the address ip->addr is assigned to node
  ip->pnn.

  Nothing is updated when the disable_ip_failover tunable is non-zero,
  when the tree does not exist yet, or when the tree holds no record
  for the address; the latter two cases are logged as errors.
 */
4293 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4295 struct public_ip_list *tmp_ip;
4297 /* IP tree is never built if DisableIPFailover is set */
4298 if (ctdb->tunable.disable_ip_failover != 0) {
4302 if (ctdb->ip_tree == NULL) {
4303 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
/* find the record for this address using the key derived from it */
4307 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4308 if (tmp_ip == NULL) {
4309 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
/* log first: the message needs the old owner, which is overwritten
 * by the assignment that follows */
4313 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4314 tmp_ip->pnn = ip->pnn;
/*
  Discard the IP assignment tree held in ctdb->ip_tree.
 */
4319 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
/* TALLOC_FREE is expected to free the tree and reset the pointer to
 * NULL - see the talloc macro definition */
4321 TALLOC_FREE(ctdb->ip_tree);
/*
  State of one reload of the public addresses that is in progress.
  Allocated in ctdb_control_reload_public_ips(); its talloc destructor
  is ctdb_reloadips_destructor().
 */
4324 struct ctdb_reloadips_handle {
/* daemon context this reload belongs to */
4325 struct ctdb_context *ctdb;
/* control request to reply to from the destructor */
4326 struct ctdb_req_control_old *c;
/* fd event watching the read end of the pipe from the child */
4330 struct tevent_fd *fde;
/*
  talloc destructor for a struct ctdb_reloadips_handle.

  Clears ctdb->reload_ips if it still refers to this handle, replies to
  the stored control request with h->status and sends SIGKILL to the
  child process.
  NOTE(review): confirm which of these steps are conditional.
 */
4333 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
/* only reset the context's pointer if it refers to this very handle */
4335 if (h == h->ctdb->reload_ips) {
4336 h->ctdb->reload_ips = NULL;
4339 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4342 ctdb_kill(h->ctdb, h->child, SIGKILL);
/*
  Timer callback armed by ctdb_control_reload_public_ips().
  private_data is the struct ctdb_reloadips_handle of the reload.
  NOTE(review): presumably the handle is freed here so that the
  destructor replies and kills the child - confirm.
 */
4346 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4347 struct tevent_timer *te,
4348 struct timeval t, void *private_data)
4350 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/*
  fd event callback for the read end of the pipe from the reloadips
  child.  private_data is the struct ctdb_reloadips_handle.

  One result byte is read from h->fd[0].  A short read or a non-zero
  byte is logged as a failure of the child.
 */
4355 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4356 struct tevent_fd *fde,
4357 uint16_t flags, void *private_data)
4359 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/* the child reports its result as a single byte, 0 meaning success */
4364 ret = sys_read(h->fd[0], &res, 1);
4365 if (ret < 1 || res != 0) {
4366 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
/*
  Work done by the reloadips child process.

  1. Fetch the public IPs currently known to the local node (ips).
  2. Re-read the public addresses file; afterwards the list at
     ctdb->vnn is compared against ips.
  3. For every entry of ips with no matching entry in ctdb->vnn, queue
     a CTDB_CONTROL_DEL_PUBLIC_IP control to the local node.
  4. For every entry of ctdb->vnn with no match in ips, queue a
     CTDB_CONTROL_ADD_PUBLIC_IP control to the local node.
  5. Wait for all queued controls to complete.

  All temporary allocations hang off mem_ctx, which is freed on the
  exit paths.  The int result is used by the caller as the status byte
  sent to the parent.
 */
4374 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4376 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4377 struct ctdb_public_ip_list_old *ips;
4378 struct ctdb_vnn *vnn;
4379 struct client_async_data *async_data;
4380 struct timeval timeout;
4382 struct ctdb_client_control_state *state;
4386 CTDB_NO_MEMORY(ctdb, mem_ctx);
4388 /* Read IPs from local node */
4389 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4390 CTDB_CURRENT_NODE, mem_ctx, &ips);
4393 ("Unable to fetch public IPs from local node\n"));
4394 talloc_free(mem_ctx);
4398 /* Read IPs file - this is safe since this is a child process */
4400 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4401 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4402 talloc_free(mem_ctx);
/* collects every control sent below so they can be waited for
 * together at the end */
4406 async_data = talloc_zero(mem_ctx, struct client_async_data);
4407 CTDB_NO_MEMORY(ctdb, async_data);
4409 /* Compare IPs between node and file for IPs to be deleted */
4410 for (i = 0; i < ips->num; i++) {
4412 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4413 if (ctdb_same_ip(&vnn->public_address,
4414 &ips->ips[i].addr)) {
4415 /* IP is still in file */
4421 /* Delete IP ips->ips[i] */
4422 struct ctdb_addr_info_old *pub;
4425 ("IP %s no longer configured, deleting it\n",
4426 ctdb_addr_to_str(&ips->ips[i].addr)));
/* zero-initialised request; only the address is filled in here */
4428 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4429 CTDB_NO_MEMORY(ctdb, pub);
4431 pub->addr = ips->ips[i].addr;
4435 timeout = TAKEOVER_TIMEOUT();
4437 data.dsize = offsetof(struct ctdb_addr_info_old,
4439 data.dptr = (uint8_t *)pub;
4441 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4442 CTDB_CONTROL_DEL_PUBLIC_IP,
4443 0, data, async_data,
4445 if (state == NULL) {
4448 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4452 ctdb_client_async_add(async_data, state);
4456 /* Compare IPs between node and file for IPs to be added */
4458 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4459 for (i = 0; i < ips->num; i++) {
4460 if (ctdb_same_ip(&vnn->public_address,
4461 &ips->ips[i].addr)) {
4462 /* IP already on node */
/* i reaching ips->num means the inner loop found no match */
4466 if (i == ips->num) {
4467 /* Add IP vnn->public_address (no entry in ips matched it) */
4468 struct ctdb_addr_info_old *pub;
4469 const char *ifaces = NULL;
4474 ("New IP %s configured, adding it\n",
4475 ctdb_addr_to_str(&vnn->public_address)));
/* send a CTDB_SRVID_REBALANCE_NODE message carrying our pnn to the
 * connected nodes; a failure to send is only logged as a warning */
4477 uint32_t pnn = ctdb_get_pnn(ctdb);
4479 data.dsize = sizeof(pnn);
4480 data.dptr = (uint8_t *)&pnn;
4482 ret = ctdb_client_send_message(
4484 CTDB_BROADCAST_CONNECTED,
4485 CTDB_SRVID_REBALANCE_NODE,
4488 DEBUG(DEBUG_WARNING,
4489 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
/* Build a comma separated list of the names in vnn->ifaces.
 * NOTE(review): the joined strings are allocated on vnn rather than
 * mem_ctx, and the talloc_asprintf() result does not appear to be
 * checked for NULL before strlen() is called on it - confirm */
4495 ifaces = vnn->ifaces[0];
4497 while (vnn->ifaces[iface] != NULL) {
4498 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4499 vnn->ifaces[iface]);
/* the request is the fixed part of ctdb_addr_info_old followed by
 * len bytes holding the NUL terminated interface list */
4503 len = strlen(ifaces) + 1;
4504 pub = talloc_zero_size(mem_ctx,
4505 offsetof(struct ctdb_addr_info_old, iface) + len);
4506 CTDB_NO_MEMORY(ctdb, pub);
4508 pub->addr = vnn->public_address;
4509 pub->mask = vnn->public_netmask_bits;
4511 memcpy(&pub->iface[0], ifaces, pub->len);
4513 timeout = TAKEOVER_TIMEOUT();
4515 data.dsize = offsetof(struct ctdb_addr_info_old,
4517 data.dptr = (uint8_t *)pub;
4519 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4520 CTDB_CONTROL_ADD_PUBLIC_IP,
4521 0, data, async_data,
4523 if (state == NULL) {
4526 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4530 ctdb_client_async_add(async_data, state);
/* wait for every queued add/delete control to finish */
4534 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4535 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4539 talloc_free(mem_ctx);
4543 talloc_free(mem_ctx);
4547 /* This control is sent to force the node to re-read the public addresses file
4548 and drop any addresses we should no longer host, and add new addresses
4549 that we are now able to host
/*
  Control handler: reload the public addresses.

  A child process is forked to run ctdb_reloadips_child(); it reports
  its result to the parent as one byte written to a pipe.  The parent
  keeps a struct ctdb_reloadips_handle (h) that owns the request c, an
  fd event on the read end of the pipe (ctdb_reloadips_child_handler)
  and a 120 second timer (ctdb_reloadips_timeout_event).  The reply to
  c is sent later, so *async_reply is set to true.
 */
4551 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4553 struct ctdb_reloadips_handle *h;
/* remembered before forking so the child can poll for our existence */
4554 pid_t parent = getpid();
/* a reload that is still in progress is discarded first */
4556 if (ctdb->reload_ips != NULL) {
4557 talloc_free(ctdb->reload_ips);
4558 ctdb->reload_ips = NULL;
4561 h = talloc(ctdb, struct ctdb_reloadips_handle);
4562 CTDB_NO_MEMORY(ctdb, h);
/* NOTE(review): the message below names ctdb_freeze_lock although
 * this pipe is for reloadips - looks copy-pasted, confirm */
4567 if (pipe(h->fd) == -1) {
4568 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4573 h->child = ctdb_fork(ctdb);
4574 if (h->child == (pid_t)-1) {
4575 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
/* a zero return from ctdb_fork() selects the code run by the child */
4583 if (h->child == 0) {
4584 signed char res = 0;
4587 debug_extra = talloc_asprintf(NULL, "reloadips:");
4589 prctl_set_comment("ctdb_reloadips");
4590 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4591 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4594 res = ctdb_reloadips_child(ctdb);
4596 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
/* report the result to the parent as a single byte */
4600 sys_write(h->fd[1], &res, 1);
4601 /* make sure we die when our parent dies */
4602 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* parent: keep the request under h so it can be answered later */
4608 h->c = talloc_steal(h, c);
4611 set_close_on_exec(h->fd[0]);
4613 talloc_set_destructor(h, ctdb_reloadips_destructor);
/* watch the read end of the pipe for the child's result; the fd event
 * is a talloc child of h and is asked to close the fd itself */
4616 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4617 ctdb_reloadips_child_handler, (void *)h);
4618 tevent_fd_set_auto_close(h->fde);
/* give up on the child after 120 seconds */
4620 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4621 ctdb_reloadips_timeout_event, h);
4623 /* we reply later */
4624 *async_reply = true;