4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
// Expands to timeval_current_ofs() called with the takeover_timeout tunable
// and 0.  The expansion refers to a variable named ctdb, which must be in
// scope wherever the macro is used.
45 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
// First argument of timeval_current_ofs() when ctdb_control_send_arp()
// re-arms its timer.
47 #define CTDB_ARP_INTERVAL 1
// ctdb_control_send_arp() compares arp->count against this value.
48 #define CTDB_ARP_REPEAT 3
50 /* Flags used in IP allocation algorithms. */
// Enumerates IP allocation algorithms; struct ipalloc_state stores one of
// these values in its algorithm member.
// NOTE(review): the enumerator list visible here may be incomplete.
56 enum ipalloc_algorithm {
57 IPALLOC_DETERMINISTIC,
58 IPALLOC_NONDETERMINISTIC,
// State handed to the IP allocation helpers in this file.
62 struct ipalloc_state {
65 /* Arrays with data for each node */
// Both arrays are indexed by node number (see can_node_host_ip() and
// create_merged_ip_list()); an entry may be NULL.
66 struct ctdb_public_ip_list_old **known_public_ips;
67 struct ctdb_public_ip_list_old **available_public_ips;
69 enum ipalloc_algorithm algorithm;
70 uint32_t no_ip_failback;
// One local network interface, kept on the list headed by ctdb->ifaces.
// prev/next are the links used by DLIST_ADD and DLIST_REMOVE.
73 struct ctdb_interface {
74 struct ctdb_interface *prev, *next;
// Return the name of the interface currently assigned to the VNN
// (vnn->iface->name).
// NOTE(review): other code in this file calls this while also testing
// vnn->iface for NULL, so confirm that a NULL guard protects the dereference.
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
83 return vnn->iface->name;
// Register a local interface, by name, on ctdb->ifaces.
//
// ctdb  - daemon context that owns the interface list
// iface - name of the interface to add
//
// The list is first scanned for an entry with the same name.  A new
// zero-initialised struct ctdb_interface is then allocated, given its own
// copy of the name and linked onto ctdb->ifaces.
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
91 struct ctdb_interface *i;
93 /* Verify that we don't have an entry for this interface yet */
94 for (i=ctdb->ifaces;i;i=i->next) {
95 if (strcmp(i->name, iface) == 0) {
100 /* create a new structure for this interface */
// The struct is a talloc child of ctdb and the name a talloc child of the
// struct.
101 i = talloc_zero(ctdb, struct ctdb_interface);
102 CTDB_NO_MEMORY_FATAL(ctdb, i);
103 i->name = talloc_strdup(i, iface);
104 CTDB_NO_MEMORY(ctdb, i->name);
108 DLIST_ADD(ctdb->ifaces, i);
// Test whether name equals one of the entries of vnn->ifaces, which is
// walked as a NULL-terminated array of interface names.
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
118 for (n = 0; vnn->ifaces[n] != NULL; n++) {
119 if (strcmp(name, vnn->ifaces[n]) == 0) {
127 /* If any interfaces now have no possible IPs then delete them. This
128 * implementation is naive (i.e. simple) rather than clever
129 * (i.e. complex). Given that this is run on delip and that operation
130 * is rare, this doesn't need to be efficient - it needs to be
131 * foolproof. One alternative is reference counting, where the logic
132 * is distributed and can, therefore, be broken in multiple places.
133 * Another alternative is to build a red-black tree of interfaces that
134 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135 * once) and then walking ctdb->ifaces once and deleting those not in
136 * the tree. Let's go to one of those if the naive implementation
137 * causes problems... :-)
// Unlink entries of ctdb->ifaces that are no longer used by any VNN.
//
// ctdb - daemon context holding the interface and VNN lists
// vnn  - only interfaces named in this VNN are examined
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140 struct ctdb_vnn *vnn)
142 struct ctdb_interface *i, *next;
144 /* For each interface, check if there's an IP using it. */
// The loop advances through next rather than i->next, which allows the
// current entry to be unlinked during the walk.
145 for (i = ctdb->ifaces; i != NULL; i = next) {
150 /* Only consider interfaces named in the given VNN. */
151 if (!vnn_has_interface_with_name(vnn, i->name)) {
155 /* Is the "single IP" on this interface? */
156 if ((ctdb->single_ip_vnn != NULL) &&
157 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159 /* Found, next interface please... */
162 /* Search for a vnn with this interface. */
164 for (tv=ctdb->vnn; tv; tv=tv->next) {
165 if (vnn_has_interface_with_name(tv, i->name)) {
172 /* None of the VNNs are using this interface. */
173 DLIST_REMOVE(ctdb->ifaces, i);
// Search ctdb->ifaces for an interface whose name equals iface (strcmp).
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
183 struct ctdb_interface *i;
185 for (i=ctdb->ifaces;i;i=i->next) {
186 if (strcmp(i->name, iface) == 0) {
// Pick an interface for the VNN from the names listed in vnn->ifaces.
// Each name is resolved with ctdb_find_iface() and candidates are compared
// by their references counter.
// NOTE(review): a candidate with fewer references than best presumably
// replaces it - confirm against the full loop body.
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195 struct ctdb_vnn *vnn)
198 struct ctdb_interface *cur = NULL;
199 struct ctdb_interface *best = NULL;
201 for (i=0; vnn->ifaces[i]; i++) {
203 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
217 if (cur->references < best->references) {
// Bind the VNN to an interface chosen by ctdb_vnn_best_iface() and record
// this node as holder of the address (vnn->pnn = ctdb->pnn).
//
// Logging: DEBUG_INFO when the address is still assigned or has just been
// assigned, DEBUG_ERR when no interface can be used.
// NOTE(review): the DEBUG_ERR text reads "to iface any iface"; this looks
// like a typo for "to any iface".
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227 struct ctdb_vnn *vnn)
229 struct ctdb_interface *best = NULL;
232 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233 "still assigned to iface '%s'\n",
234 ctdb_addr_to_str(&vnn->public_address),
235 ctdb_vnn_iface_string(vnn)));
239 best = ctdb_vnn_best_iface(ctdb, vnn);
241 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242 "cannot assign to iface any iface\n",
243 ctdb_addr_to_str(&vnn->public_address)));
249 vnn->pnn = ctdb->pnn;
251 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252 "now assigned to iface '%s' refs[%d]\n",
253 ctdb_addr_to_str(&vnn->public_address),
254 ctdb_vnn_iface_string(vnn),
// Detach the VNN from its interface: the old assignment is logged, the
// reference counter of the interface is decremented, and the case where
// this node is recorded as holder (vnn->pnn == ctdb->pnn) is handled.
// NOTE(review): the DEBUG call tolerates vnn->iface == NULL but the
// decrement dereferences it; confirm that the decrement is guarded.
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260 struct ctdb_vnn *vnn)
262 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263 "now unassigned (old iface '%s' refs[%d])\n",
264 ctdb_addr_to_str(&vnn->public_address),
265 ctdb_vnn_iface_string(vnn),
266 vnn->iface?vnn->iface->references:0));
268 vnn->iface->references--;
271 if (vnn->pnn == ctdb->pnn) {
// Decide whether this node is currently able to host the VNN.
// Tests visible here, in order: the daemon runstate against
// CTDB_RUNSTATE_RUNNING, the delete_pending mark of the VNN, link_up of the
// assigned interface (if any), and then every interface named in
// vnn->ifaces, resolved through ctdb_find_iface().
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277 struct ctdb_vnn *vnn)
281 /* Nodes that are not RUNNING can not host IPs */
282 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
286 if (vnn->delete_pending) {
290 if (vnn->iface && vnn->iface->link_up) {
294 for (i=0; vnn->ifaces[i]; i++) {
295 struct ctdb_interface *cur;
297 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
// State passed to the timer handler ctdb_control_send_arp() for one
// announced address.
310 struct ctdb_takeover_arp {
311 struct ctdb_context *ctdb;
// Connections that receive tickle ACKs; ctdb_announce_vnn_iface() moves
// this array over from vnn->tcp_array.
314 struct ctdb_tcp_array *tcparray;
315 struct ctdb_vnn *vnn;
320 lists of tcp endpoints
// Doubly linked list node (prev/next) holding one ctdb_connection.
322 struct ctdb_tcp_list {
323 struct ctdb_tcp_list *prev, *next;
324 struct ctdb_connection connection;
328 list of clients to kill on IP release
// Doubly linked list node associating a client with an address; the list is
// walked from ctdb->client_ip_list by release_kill_clients().
330 struct ctdb_client_ip {
331 struct ctdb_client_ip *prev, *next;
332 struct ctdb_context *ctdb;
339 send a gratuitous arp
// Timer handler that announces a public address.
//
// ev, te, t    - tevent timer callback arguments
// private_data - the struct ctdb_takeover_arp for the address
//
// A gratuitous ARP for arp->addr is sent on the interface assigned to the
// VNN, followed by a TCP tickle ACK for every connection in arp->tcparray.
// Send failures are logged at DEBUG_CRIT.  The handler schedules itself
// again on vnn->takeover_ctx; arp->count is compared with CTDB_ARP_REPEAT
// (NOTE(review): presumably to stop repeating - confirm).
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342 struct tevent_timer *te,
343 struct timeval t, void *private_data)
345 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
346 struct ctdb_takeover_arp);
348 struct ctdb_tcp_array *tcparray;
349 const char *iface = ctdb_vnn_iface_string(arp->vnn);
351 ret = ctdb_sys_send_arp(&arp->addr, iface);
353 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354 iface, strerror(errno)));
357 tcparray = arp->tcparray;
359 for (i=0;i<tcparray->num;i++) {
360 struct ctdb_connection *tcon;
362 tcon = &tcparray->connections[i];
363 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364 (unsigned)ntohs(tcon->dst.ip.sin_port),
365 ctdb_addr_to_str(&tcon->src),
366 (unsigned)ntohs(tcon->src.ip.sin_port)));
367 ret = ctdb_sys_send_tcp(
372 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373 ctdb_addr_to_str(&tcon->src)));
380 if (arp->count == CTDB_ARP_REPEAT) {
385 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387 ctdb_control_send_arp, arp);
// Start announcing the public address of the VNN.
//
// A struct ctdb_takeover_arp is allocated under vnn->takeover_ctx (the
// context is created first if it does not exist), filled with the public
// address and the connection array of the VNN, and ctdb_control_send_arp()
// is scheduled with a zero timeout.
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391 struct ctdb_vnn *vnn)
393 struct ctdb_takeover_arp *arp;
394 struct ctdb_tcp_array *tcparray;
396 if (!vnn->takeover_ctx) {
397 vnn->takeover_ctx = talloc_new(vnn);
398 if (!vnn->takeover_ctx) {
403 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
409 arp->addr = vnn->public_address;
412 tcparray = vnn->tcp_array;
414 /* add all of the known tcp connections for this IP to the
415 list of tcp connections to send tickle acks for */
// talloc_steal() reparents the array to arp; the VNN gives up its pointer
// and is flagged as needing a tcp update.
416 arp->tcparray = talloc_steal(arp, tcparray);
418 vnn->tcp_array = NULL;
419 vnn->tcp_update_needed = true;
422 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423 timeval_zero(), ctdb_control_send_arp, arp);
// Carries the pending control request (c), the address and the VNN from
// ctdb_control_release_ip() to release_ip_callback().
428 struct takeover_callback_state {
429 struct ctdb_req_control_old *c;
430 ctdb_sock_addr *addr;
431 struct ctdb_vnn *vnn;
// Carries the pending control request (c) and the VNN from ctdb_do_takeip()
// to ctdb_do_takeip_callback().
434 struct ctdb_do_takeip_state {
435 struct ctdb_req_control_old *c;
436 struct ctdb_vnn *vnn;
440 called when takeip event finishes
// Completion handler for the event script started by ctdb_do_takeip().
//
// status       - script result; -ETIME is tested explicitly
// private_data - the struct ctdb_do_takeip_state of the request
//
// On the failure path the error is logged, the requester receives status and
// the local node is flagged NODE_FLAGS_UNHEALTHY.  Otherwise the address is
// announced when do_checkpublicip is set (a failed announcement is answered
// with -1), a CTDB_SRVID_TAKE_IP message carrying the address string is sent
// to this node, and the control is answered with 0.
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
445 struct ctdb_do_takeip_state *state =
446 talloc_get_type(private_data, struct ctdb_do_takeip_state);
451 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
453 if (status == -ETIME) {
456 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457 ctdb_addr_to_str(&state->vnn->public_address),
458 ctdb_vnn_iface_string(state->vnn)));
459 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
461 node->flags |= NODE_FLAGS_UNHEALTHY;
466 if (ctdb->do_checkpublicip) {
468 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
470 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
// The message payload is the address string including its terminating NUL.
477 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478 data.dsize = strlen((char *)data.dptr) + 1;
479 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
481 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
484 /* the control succeeded */
485 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
// talloc destructor installed by ctdb_do_takeip(): clears update_in_flight
// on the VNN when the state object is freed.
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
492 state->vnn->update_in_flight = false;
497 take over an ip address
// Start taking over a public address on this node.
//
// ctdb - daemon context
// c    - control request; reparented to ctdb and answered later from
//        ctdb_do_takeip_callback()
// vnn  - VNN describing the address
//
// The request is rejected (logged at DEBUG_NOTICE) while another update for
// the VNN is in flight.  An interface is assigned with
// ctdb_vnn_assign_iface(), a state object is allocated under the VNN, and an
// event script is started with the interface name, the address and the
// netmask bits as arguments.
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500 struct ctdb_req_control_old *c,
501 struct ctdb_vnn *vnn)
504 struct ctdb_do_takeip_state *state;
506 if (vnn->update_in_flight) {
507 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508 "update for this IP already in flight\n",
509 ctdb_addr_to_str(&vnn->public_address),
510 vnn->public_netmask_bits));
514 ret = ctdb_vnn_assign_iface(ctdb, vnn);
516 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517 "assign a usable interface\n",
518 ctdb_addr_to_str(&vnn->public_address),
519 vnn->public_netmask_bits));
523 state = talloc(vnn, struct ctdb_do_takeip_state);
524 CTDB_NO_MEMORY(ctdb, state);
526 state->c = talloc_steal(ctdb, c);
// The destructor resets update_in_flight when state is freed.
529 vnn->update_in_flight = true;
530 talloc_set_destructor(state, ctdb_takeip_destructor);
532 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533 ctdb_addr_to_str(&vnn->public_address),
534 vnn->public_netmask_bits,
535 ctdb_vnn_iface_string(vnn)));
537 ret = ctdb_event_script_callback(ctdb,
539 ctdb_do_takeip_callback,
543 ctdb_vnn_iface_string(vnn),
544 ctdb_addr_to_str(&vnn->public_address),
545 vnn->public_netmask_bits);
548 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549 ctdb_addr_to_str(&vnn->public_address),
550 ctdb_vnn_iface_string(vnn)));
// Carries the pending control request (c), the previously assigned
// interface (old) and the VNN to ctdb_do_updateip_callback().
558 struct ctdb_do_updateip_state {
559 struct ctdb_req_control_old *c;
560 struct ctdb_interface *old;
561 struct ctdb_vnn *vnn;
565 called when updateip event finishes
// Completion handler for the CTDB_EVENT_UPDATE_IP script.
//
// status       - script result; -ETIME is tested explicitly
// private_data - the struct ctdb_do_updateip_state of the request
//
// On the failure path the failed move is logged, the VNN is unassigned and
// put back on the old interface (state->old) with that interface gaining a
// reference, and the requester receives status.  Otherwise the address is
// announced when do_checkpublicip is set (a failed announcement is answered
// with -1) and the control is answered with 0.
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
570 struct ctdb_do_updateip_state *state =
571 talloc_get_type(private_data, struct ctdb_do_updateip_state);
575 if (status == -ETIME) {
578 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579 ctdb_addr_to_str(&state->vnn->public_address),
581 ctdb_vnn_iface_string(state->vnn)));
584 * All we can do is reset the old interface
585 * and let the next run fix it
587 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588 state->vnn->iface = state->old;
589 state->vnn->iface->references++;
591 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
596 if (ctdb->do_checkpublicip) {
598 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
600 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
607 /* the control succeeded */
608 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
// talloc destructor installed by ctdb_do_updateip(): clears update_in_flight
// on the VNN when the state object is freed.
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
615 state->vnn->update_in_flight = false;
620 update (move) an ip address
// Move a public address from its current interface to a newly chosen one.
//
// ctdb - daemon context
// c    - control request; answered here for a no-op move, otherwise
//        reparented to ctdb and answered from ctdb_do_updateip_callback()
// vnn  - VNN describing the address
//
// The request is rejected (logged at DEBUG_NOTICE) while another update for
// the VNN is in flight.  The current interface is remembered in old, the VNN
// is unassigned and assigned again, and when the old and new interface names
// are equal the control is answered with 0 without running a script.
// Otherwise CTDB_EVENT_UPDATE_IP is started.
// NOTE(review): the DEBUG_ERR text below spells "assin"; "assign" is meant.
// NOTE(review): old->name is read without testing old for NULL - confirm
// that callers only pass a VNN with an interface assigned.
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623 struct ctdb_req_control_old *c,
624 struct ctdb_vnn *vnn)
627 struct ctdb_do_updateip_state *state;
628 struct ctdb_interface *old = vnn->iface;
629 const char *new_name;
631 if (vnn->update_in_flight) {
632 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633 "update for this IP already in flight\n",
634 ctdb_addr_to_str(&vnn->public_address),
635 vnn->public_netmask_bits));
639 ctdb_vnn_unassign_iface(ctdb, vnn);
640 ret = ctdb_vnn_assign_iface(ctdb, vnn);
642 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643 "assin a usable interface (old iface '%s')\n",
644 ctdb_addr_to_str(&vnn->public_address),
645 vnn->public_netmask_bits,
650 new_name = ctdb_vnn_iface_string(vnn);
651 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652 /* A benign update from one interface onto itself.
653 * no need to run the eventscripts in this case, just return
656 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
660 state = talloc(vnn, struct ctdb_do_updateip_state);
661 CTDB_NO_MEMORY(ctdb, state);
663 state->c = talloc_steal(ctdb, c);
667 vnn->update_in_flight = true;
668 talloc_set_destructor(state, ctdb_updateip_destructor);
670 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671 "interface %s to %s\n",
672 ctdb_addr_to_str(&vnn->public_address),
673 vnn->public_netmask_bits,
677 ret = ctdb_event_script_callback(ctdb,
679 ctdb_do_updateip_callback,
681 CTDB_EVENT_UPDATE_IP,
685 ctdb_addr_to_str(&vnn->public_address),
686 vnn->public_netmask_bits);
688 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689 ctdb_addr_to_str(&vnn->public_address),
690 old->name, new_name));
699 Find the vnn that holds the given public ip address
700 returns NULL if the address is not known as a public address
// Walk the ctdb->vnn list and compare the public_address of each VNN with
// addr using ctdb_same_ip().
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
704 struct ctdb_vnn *vnn;
706 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707 if (ctdb_same_ip(&vnn->public_address, addr)) {
716 take over an ip address
// Handle a takeover control for the public address carried in indata.
//
// ctdb   - daemon context
// c      - control request being served
// indata - holds a struct ctdb_public_ip (address and target pnn)
//
// A request whose pip->pnn is not this node is logged at DEBUG_ERR, and an
// address that is not a configured public address is logged at DEBUG_INFO.
// With IP failover enabled and do_checkpublicip set, ctdb_sys_have_ip()
// reports whether the address is already present on this host and
// ctdb_vnn_best_iface() selects a candidate interface.  Depending on that
// state the address is taken over (ctdb_do_takeip()), moved to another
// interface (ctdb_do_updateip()), or the request is logged as redundant.
// The reply to the control is sent asynchronously.
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719 struct ctdb_req_control_old *c,
724 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725 struct ctdb_vnn *vnn;
726 bool have_ip = false;
727 bool do_updateip = false;
728 bool do_takeip = false;
729 struct ctdb_interface *best_iface = NULL;
731 if (pip->pnn != ctdb->pnn) {
732 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733 "with pnn %d, but we're node %d\n",
734 ctdb_addr_to_str(&pip->addr),
735 pip->pnn, ctdb->pnn));
739 /* update our vnn list */
740 vnn = find_public_ip_vnn(ctdb, &pip->addr);
742 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743 ctdb_addr_to_str(&pip->addr)));
747 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748 have_ip = ctdb_sys_have_ip(&pip->addr);
750 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751 if (best_iface == NULL) {
// Adjacent string literals are concatenated as-is, so the first piece must
// end with a space.
752 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find "
753 "a usable interface (old %s, have_ip %d)\n",
754 ctdb_addr_to_str(&vnn->public_address),
755 vnn->public_netmask_bits,
756 ctdb_vnn_iface_string(vnn),
761 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
767 if (vnn->iface == NULL && have_ip) {
768 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770 ctdb_addr_to_str(&vnn->public_address)));
774 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776 "and we have it on iface[%s], but it was assigned to node %d "
777 "and we are node %d, banning ourself\n",
778 ctdb_addr_to_str(&vnn->public_address),
779 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
784 if (vnn->pnn == -1 && have_ip) {
785 vnn->pnn = ctdb->pnn;
786 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787 "and we already have it on iface[%s], update local daemon\n",
788 ctdb_addr_to_str(&vnn->public_address),
789 ctdb_vnn_iface_string(vnn)));
794 if (vnn->iface != best_iface) {
795 if (!vnn->iface->link_up) {
797 } else if (vnn->iface->references > (best_iface->references + 1)) {
798 /* only move when the rebalance gains something */
806 ctdb_vnn_unassign_iface(ctdb, vnn);
813 ret = ctdb_do_takeip(ctdb, c, vnn);
817 } else if (do_updateip) {
818 ret = ctdb_do_updateip(ctdb, c, vnn);
824 * The interface is up and the kernel knows the ip
827 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828 ctdb_addr_to_str(&pip->addr),
829 vnn->public_netmask_bits,
830 ctdb_vnn_iface_string(vnn)));
834 /* tell ctdb_control.c that we will be replying asynchronously */
841 kill any clients that are registered with an IP that is being released
// Kill local client processes registered against an address being released.
//
// ctdb - daemon context; supplies client_ip_list and the request id table
// addr - address being released
//
// Every entry of ctdb->client_ip_list is logged and compared with addr via
// ctdb_same_ip().  For a match the client is looked up with reqid_find()
// and, when its pid is non-zero, sent SIGKILL.
// NOTE(review): tmp_addr is presumably a copy of ip->addr; the assignment is
// not among the lines shown here.
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
845 struct ctdb_client_ip *ip;
847 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848 ctdb_addr_to_str(addr)));
850 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851 ctdb_sock_addr tmp_addr;
854 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
856 ctdb_addr_to_str(&ip->addr)));
858 if (ctdb_same_ip(&tmp_addr, addr)) {
859 struct ctdb_client *client = reqid_find(ctdb->idr,
862 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
864 ctdb_addr_to_str(&ip->addr),
867 if (client->pid != 0) {
868 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869 (unsigned)client->pid,
870 ctdb_addr_to_str(addr),
872 kill(client->pid, SIGKILL);
// Take a VNN out of service: unlink it from ctdb->vnn, drop its interface
// assignment, then prune interfaces that no other VNN uses.
878 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
880 DLIST_REMOVE(ctdb->vnn, vnn);
881 ctdb_vnn_unassign_iface(ctdb, vnn);
882 ctdb_remove_orphaned_ifaces(ctdb, vnn);
887 called when releaseip event finishes
// Completion handler for the CTDB_EVENT_RELEASE_IP script.
//
// status       - script result; -ETIME is tested explicitly
// private_data - the struct takeover_callback_state of the request
//
// With IP failover enabled and do_checkpublicip set, an address that
// ctdb_sys_have_ip() still reports as present makes the release fail.
// Otherwise a CTDB_SRVID_RELEASE_IP message carrying the address string is
// sent to this node, clients registered on the address are killed, the VNN
// is unassigned (and deleted when it is marked delete_pending), and the
// control is answered with 0.
889 static void release_ip_callback(struct ctdb_context *ctdb, int status,
892 struct takeover_callback_state *state =
893 talloc_get_type(private_data, struct takeover_callback_state);
896 if (status == -ETIME) {
900 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901 if (ctdb_sys_have_ip(state->addr)) {
903 ("IP %s still hosted during release IP callback, failing\n",
904 ctdb_addr_to_str(state->addr)));
905 ctdb_request_control_reply(ctdb, state->c,
912 /* send a message to all clients of this node telling them
913 that the cluster has been reconfigured and they should
914 release any sockets on this IP */
// The payload is a talloc copy of the address string owned by state,
// including its terminating NUL.
915 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917 data.dsize = strlen((char *)data.dptr)+1;
919 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
921 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
923 /* kill clients that have registered with this IP */
924 release_kill_clients(ctdb, state->addr);
926 ctdb_vnn_unassign_iface(ctdb, state->vnn);
928 /* Process the IP if it has been marked for deletion */
929 if (state->vnn->delete_pending) {
930 do_delete_ip(ctdb, state->vnn);
934 /* the control succeeded */
935 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
// talloc destructor installed by ctdb_control_release_ip(): clears
// update_in_flight on the VNN, provided the state still refers to one.
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
941 if (state->vnn != NULL) {
942 state->vnn->update_in_flight = false;
948 release an ip address
// Handle a release control for the public address carried in indata.
//
// ctdb        - daemon context
// c           - control request; reparented to the state object and answered
//               from release_ip_callback()
// indata      - holds a struct ctdb_public_ip
// async_reply - set to true once the release event script has been started
//
// Pending announcements for the VNN are cancelled by freeing
// vnn->takeover_ctx.  A release for an address this node does not hold, or
// for a VNN without an interface, is logged as redundant.  Only one update
// per VNN may be in flight.  The CTDB_EVENT_RELEASE_IP script is then
// started with release_ip_callback() as completion handler.
// NOTE(review): the strdup() result stored in iface is not visibly checked
// for NULL or freed in the lines shown here - confirm.
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
951 struct ctdb_req_control_old *c,
956 struct takeover_callback_state *state;
957 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958 struct ctdb_vnn *vnn;
961 /* update our vnn list */
962 vnn = find_public_ip_vnn(ctdb, &pip->addr);
964 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965 ctdb_addr_to_str(&pip->addr)));
970 /* stop any previous arps */
971 talloc_free(vnn->takeover_ctx);
972 vnn->takeover_ctx = NULL;
974 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975 * lazy multicast to drop an IP from any node that isn't the
976 * intended new node. The following makes ctdbd ignore
977 * a release for any address it doesn't host.
979 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980 if (!ctdb_sys_have_ip(&pip->addr)) {
981 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982 ctdb_addr_to_str(&pip->addr),
983 vnn->public_netmask_bits,
984 ctdb_vnn_iface_string(vnn)));
985 ctdb_vnn_unassign_iface(ctdb, vnn);
989 if (vnn->iface == NULL) {
990 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991 ctdb_addr_to_str(&pip->addr),
992 vnn->public_netmask_bits));
997 /* There is a potential race between take_ip and us because we
998 * update the VNN via a callback that runs when the
999 * eventscripts have been run. Avoid the race by allowing one
1000 * update to be in flight at a time.
1002 if (vnn->update_in_flight) {
1003 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004 "update for this IP already in flight\n",
1005 ctdb_addr_to_str(&vnn->public_address),
1006 vnn->public_netmask_bits));
1010 iface = strdup(ctdb_vnn_iface_string(vnn));
1012 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
1013 ctdb_addr_to_str(&pip->addr),
1014 vnn->public_netmask_bits,
1018 state = talloc(ctdb, struct takeover_callback_state);
1019 if (state == NULL) {
1020 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021 __FILE__, __LINE__);
1026 state->c = talloc_steal(state, c);
1027 state->addr = talloc(state, ctdb_sock_addr);
1028 if (state->addr == NULL) {
1029 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030 __FILE__, __LINE__);
1035 *state->addr = pip->addr;
1038 vnn->update_in_flight = true;
1039 talloc_set_destructor(state, ctdb_releaseip_destructor);
1041 ret = ctdb_event_script_callback(ctdb,
1042 state, release_ip_callback, state,
1043 CTDB_EVENT_RELEASE_IP,
1046 ctdb_addr_to_str(&pip->addr),
1047 vnn->public_netmask_bits);
1050 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051 ctdb_addr_to_str(&pip->addr),
1052 ctdb_vnn_iface_string(vnn)));
1057 /* tell the control that we will be reply asynchronously */
1058 *async_reply = true;
// Add one public address, together with its candidate interfaces, to
// ctdb->vnn.
//
// ctdb   - daemon context
// addr   - public address
// mask   - netmask bits
// ifaces - comma-separated list of interface names
//
// Every named interface is checked with ctdb_sys_check_iface_exists(), and
// an address already on the list is reported at DEBUG_CRIT.  A new VNN is
// then allocated, its NULL-terminated ifaces array is filled with copies of
// the names, each interface is registered via ctdb_add_local_iface(), and
// the VNN is linked onto ctdb->vnn.  When check_address is set and the
// address is already present on this host, vnn->pnn is set to this node.
// NOTE(review): strtok() modifies its argument and keeps static state.  The
// strdup() copy in tmp is not visibly checked for NULL or freed - confirm.
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063 ctdb_sock_addr *addr,
1064 unsigned mask, const char *ifaces,
1067 struct ctdb_vnn *vnn;
1074 tmp = strdup(ifaces);
1075 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076 if (!ctdb_sys_check_iface_exists(iface)) {
1077 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1084 /* Verify that we don't have an entry for this ip yet */
1085 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1088 ctdb_addr_to_str(addr)));
1093 /* create a new vnn structure for this ip address */
1094 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097 tmp = talloc_strdup(vnn, ifaces);
1098 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1099 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
// num + 2 leaves room for the new entry and the terminating NULL.
1100 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1107 vnn->ifaces[num] = NULL;
1108 vnn->public_address = *addr;
1109 vnn->public_netmask_bits = mask;
1111 if (check_address) {
1112 if (ctdb_sys_have_ip(addr)) {
1113 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114 vnn->pnn = ctdb->pnn;
1118 for (i=0; vnn->ifaces[i]; i++) {
1119 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1121 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122 "for public_address[%s]\n",
1123 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1129 DLIST_ADD(ctdb->vnn, vnn);
1135 setup the public address lists from a file
// Load the public address list from ctdb->public_addresses_file and add
// every entry through ctdb_add_public_address().
//
// ctdb            - daemon context
// check_addresses - passed on to ctdb_add_public_address()
//
// Empty lines at the end of the file are tested for, leading spaces and tabs
// of each line are skipped, and each line is split on space or tab into an
// address token and an interface token.  When no interface is given,
// ctdb->default_public_interface is used; if that is unset too the line is
// reported at DEBUG_CRIT.  Line numbers in messages are 1-based (i+1).
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1143 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144 if (lines == NULL) {
1145 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1148 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1152 for (i=0;i<nlines;i++) {
1154 ctdb_sock_addr addr;
1155 const char *addrstr;
1160 while ((*line == ' ') || (*line == '\t')) {
1166 if (strcmp(line, "") == 0) {
1169 tok = strtok(line, " \t");
1171 tok = strtok(NULL, " \t");
1173 if (NULL == ctdb->default_public_interface) {
1174 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1179 ifaces = ctdb->default_public_interface;
1184 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1189 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
// Configure the single public IP of the daemon (ctdb->single_ip_vnn).
//
// A VNN is allocated with exactly one interface name (iface) followed by a
// NULL terminator, ip is parsed into its public_address, the interface is
// registered with ctdb_add_local_iface() and marked link_up, the VNN is
// assigned to it and finally stored in ctdb->single_ip_vnn.
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1205 struct ctdb_vnn *svnn;
1206 struct ctdb_interface *cur = NULL;
1210 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211 CTDB_NO_MEMORY(ctdb, svnn);
1213 svnn->ifaces = talloc_array(svnn, const char *, 2);
1214 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217 svnn->ifaces[1] = NULL;
1219 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1225 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1227 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228 "for single_ip[%s]\n",
1230 ctdb_addr_to_str(&svnn->public_address)));
1235 /* assume the single public ip interface is initially "good" */
1236 cur = ctdb_find_iface(ctdb, iface);
1238 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1241 cur->link_up = true;
1243 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1249 ctdb->single_ip_vnn = svnn;
// Singly linked list of public addresses used by the allocation helpers.
// The helpers in this file also read and write a pnn member on each entry.
1253 struct public_ip_list {
1254 struct public_ip_list *next;
1256 ctdb_sock_addr addr;
1259 /* Given a physical node, return the number of
1260 public addresses that are currently assigned to this node.
// Walk the ips list and test the pnn of each entry against the given node.
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1266 for (;ips;ips=ips->next) {
1267 if (ips->pnn == pnn) {
1275 /* Can the given node host the given IP: is the public IP known to the
1276 * node and is NOIPHOST unset?
// Decide whether node pnn is able to host ip.
// The noiphost flag is tested first.  The entry for the node in
// ipalloc_state->available_public_ips (which may be NULL) is then searched
// for ip->addr using ctdb_same_ip().
1278 static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
1280 struct ctdb_ipflags ipflags,
1281 struct public_ip_list *ip)
1283 struct ctdb_public_ip_list_old *public_ips;
1286 if (ipflags.noiphost) {
1290 public_ips = ipalloc_state->available_public_ips[pnn];
1292 if (public_ips == NULL) {
1296 for (i=0; i<public_ips->num; i++) {
1297 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1298 /* yes, this node can serve this public ip */
// Like can_node_host_ip(), with the noiptakeover flag tested before
// delegating to it.
1306 static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
1308 struct ctdb_ipflags ipflags,
1309 struct public_ip_list *ip)
1311 if (ipflags.noiptakeover) {
1315 return can_node_host_ip(ipalloc_state, pnn, ipflags, ip);
1318 /* Search the node list for a node to take over this ip.
1319 Pick the node that is currently serving the fewest ips
1320 so that the ips get spread out evenly.
// ipalloc_state - allocation state, passed on to can_node_takeover_ip()
// ipflags       - talloc array with one entry per node; its length gives
//                 the number of nodes considered
// ip            - address that needs a node
// all_ips       - full address list, used to count addresses per node
//
// Nodes that cannot take over ip are skipped; node_ip_coverage() supplies the
// current address count of the others.  A DEBUG_WARNING is logged when no
// node can be found.
1322 static int find_takeover_node(struct ipalloc_state *ipalloc_state,
1323 struct ctdb_ipflags *ipflags,
1324 struct public_ip_list *ip,
1325 struct public_ip_list *all_ips)
1327 int pnn, min=0, num;
1330 numnodes = talloc_array_length(ipflags);
1332 for (i=0; i<numnodes; i++) {
1333 /* verify that this node can serve this ip */
1334 if (!can_node_takeover_ip(ipalloc_state, i, ipflags[i], ip)) {
1335 /* no it couldnt so skip to the next node */
1339 num = node_ip_coverage(i, all_ips);
1340 /* was this the first node we checked ? */
1352 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1353 ctdb_addr_to_str(&ip->addr)));
// Build the IP_KEYLEN-word lookup key for an address.
//
// The key is assembled in a static buffer that is zeroed on every call, so
// a result must be copied before the function is called again (ip_distance()
// does this) and the function is not reentrant.  An IPv4 address fills
// key[3] only; an IPv6 address fills key[0] to key[3] from its 128 bits.  An
// unknown address family is logged at DEBUG_ERR.
// NOTE(review): bzero() is a legacy interface; memset() is the portable form.
// NOTE(review): the uint32_t cast of s6_addr assumes suitable alignment.
1363 static uint32_t *ip_key(ctdb_sock_addr *ip)
1365 static uint32_t key[IP_KEYLEN];
1367 bzero(key, sizeof(key));
1369 switch (ip->sa.sa_family) {
1371 key[3] = htonl(ip->ip.sin_addr.s_addr);
1374 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1375 key[0] = htonl(s6_a32[0]);
1376 key[1] = htonl(s6_a32[1]);
1377 key[2] = htonl(s6_a32[2]);
1378 key[3] = htonl(s6_a32[3]);
1382 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
// Callback taking a new entry (parm, used as this_ip) and a prior entry
// (data, used as prev_ip), which may be NULL.  When the new entry has no
// node yet (pnn == -1) it takes over the pnn of the prior entry.
1389 static void *add_ip_callback(void *parm, void *data)
1391 struct public_ip_list *this_ip = parm;
1392 struct public_ip_list *prev_ip = data;
1394 if (prev_ip == NULL) {
1397 if (this_ip->pnn == -1) {
1398 this_ip->pnn = prev_ip->pnn;
// Traversal callback used by create_merged_ip_list(): param points at a list
// head pointer and data is the visited entry, whose next link is set to the
// current head.
1404 static int getips_count_callback(void *param, void *data)
1406 struct public_ip_list **ip_list = (struct public_ip_list **)param;
1407 struct public_ip_list *new_ip = (struct public_ip_list *)data;
1409 new_ip->next = *ip_list;
// Forward declaration; ctdb_reload_remote_public_ips() calls this function
// on the known public IP list of a node.
1414 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1415 struct ctdb_public_ip_list_old *ips,
// Refresh ipalloc_state with the public IP lists of the cluster nodes.
//
// ctdb          - daemon context
// ipalloc_state - receives the per-node known and available IP lists
// nodemap       - nodes to query
//
// ipalloc_state->num must equal nodemap->num, otherwise the mismatch is
// reported as an invalid parameter.  Nodes flagged NODE_FLAGS_INACTIVE are
// tested for and treated separately.  For a queried node the known list is
// fetched into known_public_ips[j] and, with do_checkpublicip set, passed to
// verify_remote_ip_allocation(); the available list is fetched with
// CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE into available_public_ips[j].
1418 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1419 struct ipalloc_state *ipalloc_state,
1420 struct ctdb_node_map_old *nodemap)
1425 if (ipalloc_state->num != nodemap->num) {
1428 " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1429 ipalloc_state->num, nodemap->num));
1433 for (j=0; j<nodemap->num; j++) {
1434 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1438 /* Retrieve the list of known public IPs from the node */
1439 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1444 &ipalloc_state->known_public_ips[j]);
1447 ("Failed to read known public IPs from node: %u\n",
1452 if (ctdb->do_checkpublicip) {
1453 verify_remote_ip_allocation(ctdb,
1454 ipalloc_state->known_public_ips[j],
1458 /* Retrieve the list of available public IPs from the node */
1459 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1463 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1464 &ipalloc_state->available_public_ips[j]);
1467 ("Failed to read available public IPs from node: %u\n",
// Build one list containing the public addresses known across all nodes.
//
// ctdb->ip_tree is freed and recreated.  For every node, its
// known_public_ips list (skipped when NULL) is walked and each address is
// inserted into the tree as a zeroed struct public_ip_list keyed by
// ip_key(); nodes flagged NODE_FLAGS_DELETED are tested for.  The tree is
// finally traversed with getips_count_callback() to produce ip_list.
1476 static struct public_ip_list *
1477 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1480 struct public_ip_list *ip_list;
1481 struct ctdb_public_ip_list_old *public_ips;
1483 TALLOC_FREE(ctdb->ip_tree);
1484 ctdb->ip_tree = trbt_create(ctdb, 0);
1486 for (i=0; i < ctdb->num_nodes; i++) {
1487 public_ips = ipalloc_state->known_public_ips[i];
1489 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1493 /* there were no public ips for this node */
1494 if (public_ips == NULL) {
1498 for (j=0; j < public_ips->num; j++) {
1499 struct public_ip_list *tmp_ip;
// Entries are talloc children of the tree, so they go away with it.
1501 tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1502 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1503 /* Do not use information about IP addresses hosted
1504 * on other nodes, it may not be accurate */
1505 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1506 tmp_ip->pnn = public_ips->ips[j].pnn;
1510 tmp_ip->addr = public_ips->ips[j].addr;
1511 tmp_ip->next = NULL;
1513 trbt_insertarray32_callback(ctdb->ip_tree,
1514 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1521 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1527 * This is the length of the longest common prefix between the IPs.
1528 * It is calculated by XOR-ing the 2 IPs together and counting the
1529 * number of leading zeroes. The implementation means that all
1530 * addresses end up being 128 bits long.
1532 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1533 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1534 * lots of nodes and IP addresses?
// Distance between two addresses, derived from their ip_key() words.
// Each pair of key words is XORed into x and the loop below tests the top bit
// of x to count leading zero bits.
1536 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1538 uint32_t ip1_k[IP_KEYLEN];
1543 uint32_t distance = 0;
// The key of ip1 is copied into a local array before it is compared.
// NOTE(review): presumably ip_key() returns shared storage that a second
// call would overwrite - confirm.
1545 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1547 for (i=0; i<IP_KEYLEN; i++) {
1548 x = ip1_k[i] ^ t[i];
1552 /* Count number of leading zeroes.
1553 * FIXME? This could be optimised...
// The mask uses an unsigned constant: shifting a signed 1 into bit 31 is
// undefined behaviour in C, while 1U << 31 is well defined.
1555 while ((x & (1U << 31)) == 0) {
1565 /* Calculate the IP distance for the given IP relative to IPs on the
1566 given node. The ips argument is generally the all_ips variable
1567 used in the main part of the algorithm.
// Returns the sum of squared ip_distance() values between ip and the list
// entries that pass the pnn test below.
1569 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1570 struct public_ip_list *ips,
1573 struct public_ip_list *t;
1578 for (t=ips; t != NULL; t=t->next) {
// Entries whose pnn differs from the requested node are tested here;
// presumably they are skipped.
1579 if (t->pnn != pnn) {
1583 /* Optimisation: We never calculate the distance
1584 * between an address and itself. This allows us to
1585 * calculate the effect of removing an address from a
1586 * node by simply calculating the distance between
1587 * that address and all of the existing addresses.
1588 * Moreover, we assume that we're only ever dealing
1589 * with addresses from all_ips so we can identify an
1590 * address via a pointer rather than doing a more
1591 * expensive address comparison. */
1592 if (&(t->addr) == ip) {
1596 d = ip_distance(ip, &(t->addr));
1597 sum += d * d; /* Cheaper than pulling in math.h :-) */
1603 /* Return the LCP2 imbalance metric for addresses currently assigned
// Accumulates ip_distance_2_sum() over the entries of all_ips that pass the
// pnn test below.
1606 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1608 struct public_ip_list *t;
1610 uint32_t imbalance = 0;
1612 for (t=all_ips; t!=NULL; t=t->next) {
1613 if (t->pnn != pnn) {
1616 /* Pass the rest of the IPs rather than the whole
// Only the tail of the list (t->next) is passed, so each address is
// compared with the entries that follow it and not with earlier ones.
1619 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1625 /* Allocate any unassigned IPs just by looping through the IPs and
1626 * finding the best node for each.
// An address counts as unassigned when its pnn is -1. find_takeover_node()
// is called for each such address and a warning is logged from inside that
// condition.
1628 static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1629 struct ctdb_ipflags *ipflags,
1630 struct public_ip_list *all_ips)
1632 struct public_ip_list *tmp_ip;
1634 /* loop over all ip's and find a physical node to cover for
1637 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638 if (tmp_ip->pnn == -1) {
1639 if (find_takeover_node(ipalloc_state, ipflags,
1641 DEBUG(DEBUG_WARNING,
1642 ("Failed to find node to cover ip %s\n",
1643 ctdb_addr_to_str(&tmp_ip->addr)));
1649 /* Basic non-deterministic rebalancing algorithm.
// For each assigned address the nodes that can take it over are scanned with
// node_ip_coverage() to find the most and least loaded candidates. When the
// spread is two or more, one address of the most loaded node is handed back to
// find_takeover_node().
1651 static void basic_failback(struct ipalloc_state *ipalloc_state,
1652 struct ctdb_ipflags *ipflags,
1653 struct public_ip_list *all_ips,
1657 int maxnode, maxnum, minnode, minnum, num, retries;
1658 struct public_ip_list *tmp_ip;
// The node count is taken from the talloc length of the ipflags array.
1660 numnodes = talloc_array_length(ipflags);
1667 /* for each ip address, loop over all nodes that can serve
1668 this ip and make sure that the difference between the node
1669 serving the most and the node serving the least ip's are
1672 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1673 if (tmp_ip->pnn == -1) {
1677 /* Get the highest and lowest number of ips's served by any
1678 valid node which can serve this ip.
1682 for (i=0; i<numnodes; i++) {
1683 /* only check nodes that can actually serve this ip */
1684 if (!can_node_takeover_ip(ipalloc_state, i,
1685 ipflags[i], tmp_ip)) {
1686 /* no it couldnt so skip to the next node */
1690 num = node_ip_coverage(i, all_ips);
1691 if (maxnode == -1) {
1700 if (minnode == -1) {
// maxnode still being -1 after the scan means no node passed the
// can_node_takeover_ip() filter for this address.
1710 if (maxnode == -1) {
1711 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1712 ctdb_addr_to_str(&tmp_ip->addr)));
1717 /* if the spread between the smallest and largest coverage by
1718 a node is >=2 we steal one of the ips from the node with
1719 most coverage to even things out a bit.
1720 try to do this a limited number of times since we dont
1721 want to spend too much time balancing the ip coverage.
// The retry budget is num_ips + 5.
1723 if ( (maxnum > minnum+1)
1724 && (retries < (num_ips + 5)) ){
1725 struct public_ip_list *tmp;
1727 /* Reassign one of maxnode's VNNs */
// The result of find_takeover_node() is explicitly discarded here.
1728 for (tmp=all_ips;tmp;tmp=tmp->next) {
1729 if (tmp->pnn == maxnode) {
1730 (void)find_takeover_node(ipalloc_state,
// Prepare the per-node working arrays for the LCP2 algorithm.
// Both output arrays are allocated on tmp_ctx with one slot per entry of
// ipflags: *lcp2_imbalances receives lcp2_imbalance() for each node and
// *rebalance_candidates marks nodes that may receive failed-back addresses.
// force_rebalance_nodes may be NULL; otherwise it is a talloc array of PNNs
// that are re-marked as candidates.
// NOTE(review): tmp_ip->pnn is used as an index into *rebalance_candidates
// without a visible range check against numnodes.
1742 static bool lcp2_init(TALLOC_CTX *tmp_ctx,
1743 struct ctdb_ipflags *ipflags,
1744 struct public_ip_list *all_ips,
1745 uint32_t *force_rebalance_nodes,
1746 uint32_t **lcp2_imbalances,
1747 bool **rebalance_candidates)
1750 struct public_ip_list *tmp_ip;
1752 numnodes = talloc_array_length(ipflags);
1754 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1755 if (*rebalance_candidates == NULL) {
1756 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1759 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1760 if (*lcp2_imbalances == NULL) {
1761 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1765 for (i=0; i<numnodes; i++) {
1766 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1767 /* First step: assume all nodes are candidates */
1768 (*rebalance_candidates)[i] = true;
1771 /* 2nd step: if a node has IPs assigned then it must have been
1772 * healthy before, so we remove it from consideration. This
1773 * is overkill but is all we have because we don't maintain
1774 * state between takeover runs. An alternative would be to
1775 * keep state and invalidate it every time the recovery master
1778 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1779 if (tmp_ip->pnn != -1) {
1780 (*rebalance_candidates)[tmp_ip->pnn] = false;
1784 /* 3rd step: if a node is forced to re-balance then
1785 we allow failback onto the node */
1786 if (force_rebalance_nodes == NULL) {
1789 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1790 uint32_t pnn = force_rebalance_nodes[i];
1791 if (pnn >= numnodes) {
// A space now separates the __location__ prefix from the message text, as in
// the other __location__ messages of this function.
1793 (__location__ " unknown node %u\n", pnn));
1798 ("Forcing rebalancing of IPs to node %u\n", pnn));
1799 (*rebalance_candidates)[pnn] = true;
1805 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1806 * the IP/node combination that will cost the least.
// lcp2_imbalances is indexed by node and is updated in place when an address
// is assigned. The outer loop runs while unassigned addresses remain and
// should_loop is set.
1808 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1809 struct ctdb_ipflags *ipflags,
1810 struct public_ip_list *all_ips,
1811 uint32_t *lcp2_imbalances)
1813 struct public_ip_list *tmp_ip;
1814 int dstnode, numnodes;
1817 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1818 struct public_ip_list *minip;
1820 bool should_loop = true;
1821 bool have_unassigned = true;
1823 numnodes = talloc_array_length(ipflags);
1825 while (have_unassigned && should_loop) {
1826 should_loop = false;
1828 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1829 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1835 /* loop over each unassigned ip. */
1836 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1837 if (tmp_ip->pnn != -1) {
1841 for (dstnode=0; dstnode<numnodes; dstnode++) {
1842 /* only check nodes that can actually takeover this ip */
1843 if (!can_node_takeover_ip(ipalloc_state,
1847 /* no it couldnt so skip to the next node */
// dstdsum is the extra cost of placing this address on dstnode and dstimbl
// is the resulting imbalance of that node.
1851 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1852 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1853 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1854 ctdb_addr_to_str(&(tmp_ip->addr)),
1856 dstimbl - lcp2_imbalances[dstnode]));
// A candidate wins when it is the first one seen or when its added cost is
// strictly smaller than the best so far.
1859 if ((minnode == -1) || (dstdsum < mindsum)) {
1869 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1871 /* If we found one then assign it to the given node. */
1872 if (minnode != -1) {
1873 minip->pnn = minnode;
1874 lcp2_imbalances[minnode] = minimbl;
1875 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1876 ctdb_addr_to_str(&(minip->addr)),
1881 /* There might be a better way but at least this is clear. */
1882 have_unassigned = false;
1883 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1884 if (tmp_ip->pnn == -1) {
1885 have_unassigned = true;
1890 /* We know if we have an unassigned addresses so we might as
// Addresses that are still unassigned at this point are only reported.
1893 if (have_unassigned) {
1894 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1895 if (tmp_ip->pnn == -1) {
1896 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1897 ctdb_addr_to_str(&tmp_ip->addr)));
1903 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1904 * to move IPs from, determines the best IP/destination node
1905 * combination to move from the source node.
// lcp2_imbalances is indexed by node and is updated in place when a move is
// chosen. Only nodes flagged in rebalance_candidates are considered as
// destinations.
1907 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
1908 struct ctdb_ipflags *ipflags,
1909 struct public_ip_list *all_ips,
1911 uint32_t *lcp2_imbalances,
1912 bool *rebalance_candidates)
1914 int dstnode, mindstnode, numnodes;
1915 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1916 uint32_t minsrcimbl, mindstimbl;
1917 struct public_ip_list *minip;
1918 struct public_ip_list *tmp_ip;
1920 /* Find an IP and destination node that best reduces imbalance. */
1927 numnodes = talloc_array_length(ipflags);
1929 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1930 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1931 srcnode, lcp2_imbalances[srcnode]));
1933 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1934 /* Only consider addresses on srcnode. */
1935 if (tmp_ip->pnn != srcnode) {
1939 /* What is this IP address costing the source node? */
1940 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1941 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1943 /* Consider this IP address would cost each potential
1944 * destination node. Destination nodes are limited to
1945 * those that are newly healthy, since we don't want
1946 * to do gratuitous failover of IPs just to make minor
1947 * balance improvements.
1949 for (dstnode=0; dstnode<numnodes; dstnode++) {
1950 if (!rebalance_candidates[dstnode]) {
1954 /* only check nodes that can actually takeover this ip */
1955 if (!can_node_takeover_ip(ipalloc_state, dstnode,
1956 ipflags[dstnode], tmp_ip)) {
1957 /* no it couldnt so skip to the next node */
1961 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1962 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1963 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1965 ctdb_addr_to_str(&(tmp_ip->addr)),
// A move is accepted only when the destination ends up below the current
// source imbalance, the address costs less there than on the source, and the
// combined imbalance beats the best move found so far.
// NOTE(review): the trailing line-continuation characters inside the
// condition are not needed outside a macro definition.
1968 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1969 (dstdsum < srcdsum) && \
1970 ((mindstnode == -1) || \
1971 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1974 minsrcimbl = srcimbl;
1975 mindstnode = dstnode;
1976 mindstimbl = dstimbl;
1980 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1982 if (mindstnode != -1) {
1983 /* We found a move that makes things better... */
1984 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1985 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1986 ctdb_addr_to_str(&(minip->addr)),
1987 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
// Commit the move: both cached imbalances and the owner of the address.
1990 lcp2_imbalances[srcnode] = minsrcimbl;
1991 lcp2_imbalances[mindstnode] = mindstimbl;
1992 minip->pnn = mindstnode;
// Sort record used by lcp2_failback(): its imbalance member is filled from
// lcp2_imbalances and compared by lcp2_cmp_imbalance_pnn().
2001 struct lcp2_imbalance_pnn {
// qsort() comparator over struct lcp2_imbalance_pnn records, keyed on the
// imbalance member.
// NOTE(review): the return statements are not visible here; presumably the
// order is descending so that the most imbalanced node sorts first - confirm.
2006 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
2008 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
2009 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
2011 if (lipa->imbalance > lipb->imbalance) {
2013 } else if (lipa->imbalance == lipb->imbalance) {
2020 /* LCP2 algorithm for rebalancing the cluster. This finds the source
2021 * node with the highest LCP2 imbalance, and then determines the best
2022 * IP/destination node combination to move from the source node.
// The per-node imbalances are copied into a scratch array, sorted with
// lcp2_cmp_imbalance_pnn(), and each record is then offered to
// lcp2_failback_candidate() as a source node.
2024 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
2025 struct ctdb_ipflags *ipflags,
2026 struct public_ip_list *all_ips,
2027 uint32_t *lcp2_imbalances,
2028 bool *rebalance_candidates)
2031 struct lcp2_imbalance_pnn * lips;
2034 numnodes = talloc_array_length(ipflags);
2037 /* Put the imbalances and nodes into an array, sort them and
2038 * iterate through candidates. Usually the 1st one will be
2039 * used, so this doesn't cost much...
2041 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2042 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2043 lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
// Rebalancing is optional, so on allocation failure log and leave the current
// assignment untouched instead of writing through a NULL pointer.
if (lips == NULL) {
DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
return;
}
2044 for (i=0; i<numnodes; i++) {
2045 lips[i].imbalance = lcp2_imbalances[i];
2047 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2049 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2050 lcp2_cmp_imbalance_pnn);
2053 for (i=0; i<numnodes; i++) {
2054 /* This means that all nodes had 0 or 1 addresses, so
2055 * can't be imbalanced.
2057 if (lips[i].imbalance == 0) {
2061 if (lcp2_failback_candidate(ipalloc_state,
2066 rebalance_candidates)) {
// Walk all_ips and test every assigned address (pnn other than -1) against
// can_node_host_ip() for its current node. Addresses whose node cannot host
// them are logged; per the existing comment below, they are set back to -1.
2078 static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state,
2079 struct ctdb_ipflags *ipflags,
2080 struct public_ip_list *all_ips)
2082 struct public_ip_list *tmp_ip;
2084 /* verify that the assigned nodes can serve that public ip
2085 and set it to -1 if not
2087 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2088 if (tmp_ip->pnn == -1) {
// The former comparison of the negated result against 0 was redundant; the
// plain negation has the same truth value and is easier to read.
2091 if (!can_node_host_ip(ipalloc_state, tmp_ip->pnn,
2092 ipflags[tmp_ip->pnn], tmp_ip)) {
2093 /* this node can not serve this ip. */
2094 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2095 ctdb_addr_to_str(&(tmp_ip->addr)),
// Deterministic allocation: every address is first assigned to node
// (list position modulo node count), then unsuitable assignments are dropped
// and the remaining unassigned addresses go through
// basic_allocate_unassigned(). No failback step is run.
2102 static bool ip_alloc_deterministic_ips(struct ipalloc_state *ipalloc_state,
2103 struct ctdb_ipflags *ipflags,
2104 struct public_ip_list *all_ips)
2106 struct public_ip_list *tmp_ip;
2109 numnodes = talloc_array_length(ipflags);
2111 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2112 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2113 * always be allocated the same way for a specific set of
2114 * available/unavailable nodes.
2117 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2118 tmp_ip->pnn = i % numnodes;
2121 /* IP failback doesn't make sense with deterministic
2122 * IPs, since the modulo step above implicitly fails
2123 * back IPs to their "home" node.
2125 if (1 == ipalloc_state->no_ip_failback) {
// The closing quote after DeterministicIPs was missing from this message.
2126 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs'\n"));
2129 unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2131 basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2133 /* No failback here! */
// Non-deterministic allocation: drop unsuitable assignments, allocate the
// unassigned addresses, then rebalance with basic_failback(). The
// no_ip_failback test below sits between allocation and rebalancing.
2138 static bool ip_alloc_nondeterministic_ips(struct ipalloc_state *ipalloc_state,
2139 struct ctdb_ipflags *ipflags,
2140 struct public_ip_list *all_ips)
2142 /* This should be pushed down into basic_failback. */
2143 struct public_ip_list *tmp_ip;
// This loop walks the whole list; presumably it counts entries into num_ips,
// which is later passed to basic_failback() - confirm.
2145 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2149 unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2151 basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2153 /* If we don't want IPs to fail back then don't rebalance IPs. */
2154 if (1 == ipalloc_state->no_ip_failback) {
2158 /* Now, try to make sure the ip addresses are evenly distributed
2161 basic_failback(ipalloc_state, ipflags, all_ips, num_ips);
// LCP2 allocation driver: drop unsuitable assignments, initialise the LCP2
// working arrays, allocate unassigned addresses, and finally rebalance with
// lcp2_failback(). The no_ip_failback and rebalance-candidate tests below sit
// between allocation and rebalancing.
// Scratch arrays live on tmp_ctx, a talloc child of ipalloc_state that is
// freed at the end.
2166 static bool ip_alloc_lcp2(struct ipalloc_state *ipalloc_state,
2167 struct ctdb_ipflags *ipflags,
2168 struct public_ip_list *all_ips,
2169 uint32_t *force_rebalance_nodes)
2171 uint32_t *lcp2_imbalances;
2172 bool *rebalance_candidates;
2173 int numnodes, num_rebalance_candidates, i;
2176 TALLOC_CTX *tmp_ctx = talloc_new(ipalloc_state);
2178 unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2180 if (!lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2181 &lcp2_imbalances, &rebalance_candidates)) {
2186 lcp2_allocate_unassigned(ipalloc_state, ipflags, all_ips, lcp2_imbalances);
2188 /* If we don't want IPs to fail back then don't rebalance IPs. */
2189 if (1 == ipalloc_state->no_ip_failback) {
2193 /* It is only worth continuing if we have suitable target
2194 * nodes to transfer IPs to. This check is much cheaper than
2197 numnodes = talloc_array_length(ipflags);
2198 num_rebalance_candidates = 0;
2199 for (i=0; i<numnodes; i++) {
2200 if (rebalance_candidates[i]) {
2201 num_rebalance_candidates++;
2204 if (num_rebalance_candidates == 0) {
2208 /* Now, try to make sure the ip addresses are evenly distributed
2211 lcp2_failback(ipalloc_state, ipflags, all_ips,
2212 lcp2_imbalances, rebalance_candidates);
2215 talloc_free(tmp_ctx);
// Scan the node map for a node that carries neither NODE_FLAGS_INACTIVE nor
// NODE_FLAGS_DISABLED. Presumably finding one makes the function return
// false - the return statements are not visible here.
2220 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2224 for (i=0;i<nodemap->num;i++) {
2225 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2226 /* Found one completely healthy node */
2234 /* The calculation part of the IP allocation algorithm. */
// Dispatches on ipalloc_state->algorithm to one of the three allocators and
// stores its result in ret. force_rebalance_nodes is only forwarded to the
// LCP2 allocator.
2235 static bool ctdb_takeover_run_core(struct ipalloc_state *ipalloc_state,
2236 struct ctdb_ipflags *ipflags,
2237 struct public_ip_list *all_ips,
2238 uint32_t *force_rebalance_nodes)
2242 switch (ipalloc_state->algorithm) {
2244 ret = ip_alloc_lcp2(ipalloc_state, ipflags, all_ips,
2245 force_rebalance_nodes);
2247 case IPALLOC_DETERMINISTIC:
2248 ret = ip_alloc_deterministic_ips(ipalloc_state, ipflags, all_ips);
2250 case IPALLOC_NONDETERMINISTIC:
2251 ret = ip_alloc_nondeterministic_ips(ipalloc_state, ipflags, all_ips);
2255 /* at this point ->pnn is the node which will own each IP
2256 or -1 if there is no node that can cover this ip
// Shared state for the GET_TUNABLE async callbacks. Besides the tunable name,
// the code in this file also uses an out array (indexed by pnn) and a fatal
// flag on this structure.
2262 struct get_tunable_callback_data {
2263 const char *tunable;
// Success callback for CTDB_CONTROL_GET_TUNABLE: validates the reply size and
// the node number, then stores the returned 32-bit value in cd->out[pnn].
// NOTE(review): pnn is uint32_t but is printed with the signed int
// conversion in the messages below.
2268 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2269 int32_t res, TDB_DATA outdata,
2272 struct get_tunable_callback_data *cd =
2273 (struct get_tunable_callback_data *)callback;
2277 /* Already handled in fail callback */
// The payload must be exactly one uint32_t.
2281 if (outdata.dsize != sizeof(uint32_t)) {
2282 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2283 cd->tunable, pnn, (int)sizeof(uint32_t),
2284 (int)outdata.dsize));
// The capacity of the output array is taken from its talloc length.
2289 size = talloc_array_length(cd->out);
2291 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2292 cd->tunable, pnn, size));
2297 cd->out[pnn] = *(uint32_t *)outdata.dptr;
// Failure callback for CTDB_CONTROL_GET_TUNABLE. Three outcomes are logged:
// a timeout, a tunable that the node does not implement, and any other error.
// NOTE(review): the code that selects between these cases and that sets the
// fatal flag is not visible here.
2300 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2301 int32_t res, TDB_DATA outdata,
2304 struct get_tunable_callback_data *cd =
2305 (struct get_tunable_callback_data *)callback;
2310 ("Timed out getting tunable \"%s\" from node %d\n",
2316 DEBUG(DEBUG_WARNING,
2317 ("Tunable \"%s\" not implemented on node %d\n",
2322 ("Unexpected error getting tunable \"%s\" from node %d\n",
// Query one named tunable from the nodes returned by
// list_of_connected_nodes().
// A uint32_t array with nodemap->num slots is allocated on tmp_ctx and
// pre-filled with default_value; get_tunable_callback() overwrites the slot of
// each node that answers. The request buffer is a
// struct ctdb_control_get_tunable with the tunable name (including its
// terminating NUL) appended.
2328 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2329 TALLOC_CTX *tmp_ctx,
2330 struct ctdb_node_map_old *nodemap,
2331 const char *tunable,
2332 uint32_t default_value)
2335 struct ctdb_control_get_tunable *t;
2338 struct get_tunable_callback_data callback_data;
2341 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2342 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2343 for (i=0; i<nodemap->num; i++) {
2344 tvals[i] = default_value;
2347 callback_data.out = tvals;
2348 callback_data.tunable = tunable;
2349 callback_data.fatal = false;
2351 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2352 data.dptr = talloc_size(tmp_ctx, data.dsize);
// Check the allocation before writing through it, as is already done for
// tvals above. tvals stays owned by tmp_ctx on this path.
CTDB_NO_MEMORY_NULL(ctdb, data.dptr);
2353 t = (struct ctdb_control_get_tunable *)data.dptr;
2354 t->length = strlen(tunable)+1;
2355 memcpy(t->name, tunable, t->length);
2356 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2357 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2358 nodes, 0, TAKEOVER_TIMEOUT(),
2360 get_tunable_callback,
2361 get_tunable_fail_callback,
2362 &callback_data) != 0) {
// The fatal flag is consulted when the control fails.
2363 if (callback_data.fatal) {
2369 talloc_free(data.dptr);
2374 /* Set internal flags for IP allocation:
2376 * Set NOIPTAKEOVER ip flags from per-node NoIPTakeover tunable
2377 * Set NOIPHOST ip flag for each INACTIVE node
2378 * if all nodes are disabled:
2379 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2381 * Set NOIPHOST ip flags for disabled nodes
// Derive the per-node allocation flags from the node map and the two tunable
// arrays. The result is a zeroed talloc array on tmp_ctx with nodemap->num
// entries. Both tunable arrays are indexed by the same node index as
// nodemap->nodes.
2383 static struct ctdb_ipflags *
2384 set_ipflags_internal(TALLOC_CTX *tmp_ctx,
2385 struct ctdb_node_map_old *nodemap,
2386 uint32_t *tval_noiptakeover,
2387 uint32_t *tval_noiphostonalldisabled)
2390 struct ctdb_ipflags *ipflags;
2392 /* Clear IP flags - implicit due to talloc_zero */
2393 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2394 if (ipflags == NULL) {
2395 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
2399 for (i=0;i<nodemap->num;i++) {
2400 /* Can not take IPs on node with NoIPTakeover set */
2401 if (tval_noiptakeover[i] != 0) {
2402 ipflags[i].noiptakeover = true;
2405 /* Can not host IPs on INACTIVE node */
2406 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2407 ipflags[i].noiphost = true;
// Two mutually exclusive policies follow, selected by whether every node is
// disabled or inactive.
2411 if (all_nodes_are_disabled(nodemap)) {
2412 /* If all nodes are disabled, can not host IPs on node
2413 * with NoIPHostOnAllDisabled set
2415 for (i=0;i<nodemap->num;i++) {
2416 if (tval_noiphostonalldisabled[i] != 0) {
2417 ipflags[i].noiphost = true;
2421 /* If some nodes are not disabled, then can not host
2422 * IPs on DISABLED node
2424 for (i=0;i<nodemap->num;i++) {
2425 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2426 ipflags[i].noiphost = true;
// Fetch two per-node tunables with get_tunable_from_nodes() and turn them
// into per-node flags via set_ipflags_internal(). The second tunable is
// NoIPHostOnAllDisabled with default 0. Both temporary arrays are freed after
// the flags are built.
// NOTE(review): the name and default of the first tunable are on a line that
// is not visible here.
2434 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2435 TALLOC_CTX *tmp_ctx,
2436 struct ctdb_node_map_old *nodemap)
2438 uint32_t *tval_noiptakeover;
2439 uint32_t *tval_noiphostonalldisabled;
2440 struct ctdb_ipflags *ipflags;
2443 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2445 if (tval_noiptakeover == NULL) {
2449 tval_noiphostonalldisabled =
2450 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2451 "NoIPHostOnAllDisabled", 0);
2452 if (tval_noiphostonalldisabled == NULL) {
2453 /* Caller frees tmp_ctx */
2457 ipflags = set_ipflags_internal(tmp_ctx, nodemap,
2459 tval_noiphostonalldisabled);
2461 talloc_free(tval_noiptakeover);
2462 talloc_free(tval_noiphostonalldisabled);
// Allocate and initialise the IP allocation state on mem_ctx.
// num is set to ctdb->num_nodes and both per-node pointer arrays are allocated
// zeroed with that many slots as talloc children of the state. If either array
// allocation fails, the state is freed.
// The algorithm is chosen from the tunables: LCP2 takes precedence over
// deterministic, and non-deterministic is the fallback.
2467 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2468 TALLOC_CTX *mem_ctx)
2470 struct ipalloc_state *ipalloc_state =
2471 talloc_zero(mem_ctx, struct ipalloc_state);
2472 if (ipalloc_state == NULL) {
2473 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2477 ipalloc_state->num = ctdb->num_nodes;
2478 ipalloc_state->known_public_ips =
2479 talloc_zero_array(ipalloc_state,
2480 struct ctdb_public_ip_list_old *,
2481 ipalloc_state->num);
2482 if (ipalloc_state->known_public_ips == NULL) {
2483 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2484 talloc_free(ipalloc_state);
2487 ipalloc_state->available_public_ips =
2488 talloc_zero_array(ipalloc_state,
2489 struct ctdb_public_ip_list_old *,
2490 ipalloc_state->num);
2491 if (ipalloc_state->available_public_ips == NULL) {
2492 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2493 talloc_free(ipalloc_state);
2497 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2498 ipalloc_state->algorithm = IPALLOC_LCP2;
2499 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2500 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2502 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2505 ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2507 return ipalloc_state;
// State handed to iprealloc_fail_callback(). Besides the members visible
// below, the code in this file also uses retry_nodes (a bool array indexed by
// pnn) and retry_count on this structure.
2510 struct iprealloc_callback_data {
2513 client_async_callback fail_callback;
2514 void *fail_callback_data;
2515 struct ctdb_node_map_old *nodemap;
// Failure callback for the IPREALLOCATED control.
// Failures from inactive nodes are logged and ignored. Per the comments
// below, a timeout is passed on to the caller-supplied fail callback when one
// is registered, and any other failure flags the node in cd->retry_nodes.
2518 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2519 int32_t res, TDB_DATA outdata,
2523 struct iprealloc_callback_data *cd =
2524 (struct iprealloc_callback_data *)callback;
2526 numnodes = talloc_array_length(cd->retry_nodes);
// Valid indexes are 0 .. numnodes-1, so pnn equal to numnodes must be
// rejected too; it would otherwise index one past the end of retry_nodes
// further down.
2527 if (pnn >= numnodes) {
2529 ("ipreallocated failure from node %d, "
2530 "but only %d nodes in nodemap\n",
2535 /* Can't run the "ipreallocated" event on a INACTIVE node */
2536 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2537 DEBUG(DEBUG_WARNING,
2538 ("ipreallocated failed on inactive node %d, ignoring\n",
2545 /* If the control timed out then that's a real error,
2546 * so call the real fail callback
2548 if (cd->fail_callback) {
2549 cd->fail_callback(ctdb, pnn, res, outdata,
2550 cd->fail_callback_data);
2552 DEBUG(DEBUG_WARNING,
2553 ("iprealloc timed out but no callback registered\n"));
2557 /* If not a timeout then either the ipreallocated
2558 * eventscript (or some setup) failed. This might
2559 * have failed because the IPREALLOCATED control isn't
2560 * implemented - right now there is no way of knowing
2561 * because the error codes are all folded down to -1.
2562 * Consider retrying using EVENTSCRIPT control...
2564 DEBUG(DEBUG_WARNING,
2565 ("ipreallocated failure from node %d, flagging retry\n",
2567 cd->retry_nodes[pnn] = true;
// State handed to takeover_run_fail_callback(). Besides the members visible
// below, the code in this file also uses node_failed (a bool array with one
// slot per node map entry) on this structure.
2572 struct takeover_callback_data {
2574 client_async_callback fail_callback;
2575 void *fail_callback_data;
2576 struct ctdb_node_map_old *nodemap;
// Failure callback used while releasing IPs. It maps node_pnn to its index in
// the node map and forwards the first failure of each node to the
// caller-supplied fail callback. Later failures of the same node are
// suppressed through the node_failed array.
2579 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2580 uint32_t node_pnn, int32_t res,
2581 TDB_DATA outdata, void *callback_data)
2583 struct takeover_callback_data *cd =
2584 talloc_get_type_abort(callback_data,
2585 struct takeover_callback_data);
2588 for (i = 0; i < cd->nodemap->num; i++) {
2589 if (node_pnn == cd->nodemap->nodes[i].pnn) {
// Reaching the end of the node map means the PNN is unknown.
2594 if (i == cd->nodemap->num) {
2595 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2599 if (!cd->node_failed[i]) {
2600 cd->node_failed[i] = true;
// The callback comes straight from the caller of ctdb_takeover_run() and the
// sibling iprealloc_fail_callback() treats the same pointer as optional, so
// guard against NULL here as well.
if (cd->fail_callback != NULL) {
2601 cd->fail_callback(ctdb, node_pnn, res, outdata,
2602 cd->fail_callback_data);
}
2607 make any IP alias changes for public addresses that are necessary
2609 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2610 uint32_t *force_rebalance_nodes,
2611 client_async_callback fail_callback, void *callback_data)
2614 struct ctdb_public_ip ip;
2616 struct public_ip_list *all_ips, *tmp_ip;
2618 struct timeval timeout;
2619 struct client_async_data *async_data;
2620 struct ctdb_client_control_state *state;
2621 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2622 struct ctdb_ipflags *ipflags;
2623 struct ipalloc_state *ipalloc_state;
2624 struct takeover_callback_data *takeover_data;
2625 struct iprealloc_callback_data iprealloc_data;
2630 * ip failover is completely disabled, just send out the
2631 * ipreallocated event.
2633 if (ctdb->tunable.disable_ip_failover != 0) {
2637 ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2638 if (ipalloc_state == NULL) {
2639 talloc_free(tmp_ctx);
2643 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2644 if (ipflags == NULL) {
2645 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2646 talloc_free(tmp_ctx);
2650 /* Fetch known/available public IPs from each active node */
2651 ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2653 talloc_free(tmp_ctx);
2657 /* Short-circuit IP allocation if no node has available IPs */
2658 can_host_ips = false;
2659 for (i=0; i < ipalloc_state->num; i++) {
2660 if (ipalloc_state->available_public_ips[i] != NULL) {
2661 can_host_ips = true;
2664 if (!can_host_ips) {
2665 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2669 /* since nodes only know about those public addresses that
2670 can be served by that particular node, no single node has
2671 a full list of all public addresses that exist in the cluster.
2672 Walk over all node structures and create a merged list of
2673 all public addresses that exist in the cluster.
2675 keep the tree of ips around as ctdb->ip_tree
2677 all_ips = create_merged_ip_list(ctdb, ipalloc_state);
2679 /* Do the IP reassignment calculations */
2680 ctdb_takeover_run_core(ipalloc_state, ipflags,
2681 all_ips, force_rebalance_nodes);
2683 /* Now tell all nodes to release any public IPs should not
2684 * host. This will be a NOOP on nodes that don't currently
2685 * hold the given IP.
2687 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2688 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2690 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2691 bool, nodemap->num);
2692 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2693 takeover_data->fail_callback = fail_callback;
2694 takeover_data->fail_callback_data = callback_data;
2695 takeover_data->nodemap = nodemap;
2697 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2698 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2700 async_data->fail_callback = takeover_run_fail_callback;
2701 async_data->callback_data = takeover_data;
2703 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2705 /* Send a RELEASE_IP to all nodes that should not be hosting
2706 * each IP. For each IP, all but one of these will be
2707 * redundant. However, the redundant ones are used to tell
2708 * nodes which node should be hosting the IP so that commands
2709 * like "ctdb ip" can display a particular nodes idea of who
2710 * is hosting what. */
2711 for (i=0;i<nodemap->num;i++) {
2712 /* don't talk to unconnected nodes, but do talk to banned nodes */
2713 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2717 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2718 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2719 /* This node should be serving this
2720 vnn so don't tell it to release the ip
2724 ip.pnn = tmp_ip->pnn;
2725 ip.addr = tmp_ip->addr;
2727 timeout = TAKEOVER_TIMEOUT();
2728 data.dsize = sizeof(ip);
2729 data.dptr = (uint8_t *)&ip;
2730 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2731 0, CTDB_CONTROL_RELEASE_IP, 0,
2734 if (state == NULL) {
2735 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2736 talloc_free(tmp_ctx);
2740 ctdb_client_async_add(async_data, state);
2743 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2744 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2745 talloc_free(tmp_ctx);
2748 talloc_free(async_data);
2751 /* For each IP, send a TAKOVER_IP to the node that should be
2752 * hosting it. Many of these will often be redundant (since
2753 * the allocation won't have changed) but they can be useful
2754 * to recover from inconsistencies. */
2755 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2756 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2758 async_data->fail_callback = fail_callback;
2759 async_data->callback_data = callback_data;
2761 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2762 if (tmp_ip->pnn == -1) {
2763 /* this IP won't be taken over */
2767 ip.pnn = tmp_ip->pnn;
2768 ip.addr = tmp_ip->addr;
2770 timeout = TAKEOVER_TIMEOUT();
2771 data.dsize = sizeof(ip);
2772 data.dptr = (uint8_t *)&ip;
2773 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2774 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2775 data, async_data, &timeout, NULL);
2776 if (state == NULL) {
2777 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2778 talloc_free(tmp_ctx);
2782 ctdb_client_async_add(async_data, state);
2784 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2785 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2786 talloc_free(tmp_ctx);
2792 * Tell all nodes to run eventscripts to process the
2793 * "ipreallocated" event. This can do a lot of things,
2794 * including restarting services to reconfigure them if public
2795 * IPs have moved. Once upon a time this event only used to
2798 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2799 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2800 iprealloc_data.retry_nodes = retry_data;
2801 iprealloc_data.retry_count = 0;
2802 iprealloc_data.fail_callback = fail_callback;
2803 iprealloc_data.fail_callback_data = callback_data;
2804 iprealloc_data.nodemap = nodemap;
2806 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2807 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2808 nodes, 0, TAKEOVER_TIMEOUT(),
2810 NULL, iprealloc_fail_callback,
2813 /* If the control failed then we should retry to any
2814 * nodes flagged by iprealloc_fail_callback using the
2815 * EVENTSCRIPT control. This is a best-effort at
2816 * backward compatibility when running a mixed cluster
2817 * where some nodes have not yet been upgraded to
2818 * support the IPREALLOCATED control.
2820 DEBUG(DEBUG_WARNING,
2821 ("Retry ipreallocated to some nodes using eventscript control\n"));
2823 nodes = talloc_array(tmp_ctx, uint32_t,
2824 iprealloc_data.retry_count);
2825 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2828 for (i=0; i<nodemap->num; i++) {
2829 if (iprealloc_data.retry_nodes[i]) {
2835 data.dptr = discard_const("ipreallocated");
2836 data.dsize = strlen((char *)data.dptr) + 1;
2837 ret = ctdb_client_async_control(ctdb,
2838 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2839 nodes, 0, TAKEOVER_TIMEOUT(),
2841 NULL, fail_callback,
2844 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2848 talloc_free(tmp_ctx);
2854 destroy a ctdb_client_ip structure
/*
 * talloc destructor for a struct ctdb_client_ip (it is installed with
 * talloc_set_destructor() in ctdb_control_tcp_client()).
 *
 * Logs the address, the IPv4 port field and the client id of the entry
 * that is going away, then unlinks the entry from
 * ip->ctdb->client_ip_list so the list never holds a freed element.
 */
2856 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2858 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2859 ctdb_addr_to_str(&ip->addr),
2860 ntohs(ip->addr.ip.sin_port),
2863 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2868 called by a client to inform us of a TCP connection that it is managing
2869 that should be tickled with an ACK when IP takeover is done
/*
 * Register a client-owned TCP connection so that it can be tickled
 * after an IP takeover.
 *
 * The connection (struct ctdb_connection) is read straight out of
 * indata.dptr.  Both endpoints are canonicalised in place, the
 * destination is looked up with find_public_ip_vnn(), and the request
 * is refused when the public address is not currently held by this
 * node (vnn->pnn != ctdb->pnn).  On acceptance a ctdb_client_ip and a
 * ctdb_tcp_list entry are allocated as talloc children of the client,
 * and a CTDB_CONTROL_TCP_ADD carrying the connection is broadcast to
 * all connected nodes without waiting for replies.
 *
 * NOTE(review): the result of reqid_find() is dereferenced
 * (client->pid) and used as a talloc parent with no NULL check visible
 * here - confirm an unknown client_id cannot reach this point.
 * NOTE(review): no check of indata.dsize against
 * sizeof(struct ctdb_connection) is visible before the cast - confirm
 * the dispatcher validates the size.
 */
2871 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2874 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2875 struct ctdb_connection *tcp_sock = NULL;
2876 struct ctdb_tcp_list *tcp;
2877 struct ctdb_connection t;
2880 struct ctdb_client_ip *ip;
2881 struct ctdb_vnn *vnn;
2882 ctdb_sock_addr addr;
2884 /* If we don't have public IPs, tickles are useless */
2885 if (ctdb->vnn == NULL) {
2889 tcp_sock = (struct ctdb_connection *)indata.dptr;
/* Canonicalise both endpoints in place; addr is only a scratch copy
 * so that source and destination of the conversion do not alias. */
2891 addr = tcp_sock->src;
2892 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2893 addr = tcp_sock->dst;
2894 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
/* The public address the client connected to is the destination */
2897 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2898 vnn = find_public_ip_vnn(ctdb, &addr);
/* Diagnostics for a destination that is not a public address; IPv4
 * loopback destinations are not logged. */
2900 switch (addr.sa.sa_family) {
2902 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2903 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2904 ctdb_addr_to_str(&addr)));
2908 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2909 ctdb_addr_to_str(&addr)));
2912 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
/* Only the node currently hosting the address may register tickles */
2918 if (vnn->pnn != ctdb->pnn) {
2919 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2920 ctdb_addr_to_str(&addr),
2921 client_id, client->pid));
2922 /* failing this call will tell smbd to die */
/* Both records are talloc children of the client, so they are freed
 * together with it; the destructor unlinks ip from client_ip_list. */
2926 ip = talloc(client, struct ctdb_client_ip);
2927 CTDB_NO_MEMORY(ctdb, ip);
2931 ip->client_id = client_id;
2932 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2933 DLIST_ADD(ctdb->client_ip_list, ip);
2935 tcp = talloc(client, struct ctdb_tcp_list);
2936 CTDB_NO_MEMORY(ctdb, tcp);
2938 tcp->connection.src = tcp_sock->src;
2939 tcp->connection.dst = tcp_sock->dst;
2941 DLIST_ADD(client->tcp_list, tcp);
/* t is the payload of the broadcast sent below */
2943 t.src = tcp_sock->src;
2944 t.dst = tcp_sock->dst;
2946 data.dptr = (uint8_t *)&t;
2947 data.dsize = sizeof(t);
/* Per-family logging: destination port -> source address:port */
2949 switch (addr.sa.sa_family) {
2951 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2952 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2953 ctdb_addr_to_str(&tcp_sock->src),
2954 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2957 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2958 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2959 ctdb_addr_to_str(&tcp_sock->src),
2960 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2963 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2967 /* tell all nodes about this tcp connection */
2968 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2969 CTDB_CONTROL_TCP_ADD,
2970 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2972 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2980 find a tcp address on a list
/*
 * Linear search of a tickle array for a connection whose source AND
 * destination both match tcp, as judged by ctdb_same_sockaddr().
 *
 * Returns a pointer to the matching element inside
 * array->connections, so the caller may modify the entry in place.
 * A NULL array is checked for up front (presumably yielding "not
 * found", as does a search without a match).
 */
2982 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2983 struct ctdb_connection *tcp)
2987 if (array == NULL) {
2991 for (i=0;i<array->num;i++) {
2992 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2993 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2994 return &array->connections[i];
3003 called by a daemon to inform us of a TCP connection that one of its
3004 clients managing that should tickled with an ACK when IP takeover is
3007 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3009 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
3010 struct ctdb_tcp_array *tcparray;
3011 struct ctdb_connection tcp;
3012 struct ctdb_vnn *vnn;
3014 /* If we don't have public IPs, tickles are useless */
3015 if (ctdb->vnn == NULL) {
3019 vnn = find_public_ip_vnn(ctdb, &p->dst);
3021 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3022 ctdb_addr_to_str(&p->dst)));
3028 tcparray = vnn->tcp_array;
3030 /* If this is the first tickle */
3031 if (tcparray == NULL) {
3032 tcparray = talloc(vnn, struct ctdb_tcp_array);
3033 CTDB_NO_MEMORY(ctdb, tcparray);
3034 vnn->tcp_array = tcparray;
3037 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3038 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3040 tcparray->connections[tcparray->num].src = p->src;
3041 tcparray->connections[tcparray->num].dst = p->dst;
3044 if (tcp_update_needed) {
3045 vnn->tcp_update_needed = true;
3051 /* Do we already have this tickle ?*/
3054 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3055 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3056 ctdb_addr_to_str(&tcp.dst),
3057 ntohs(tcp.dst.ip.sin_port),
3062 /* A new tickle, we must add it to the array */
3063 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3064 struct ctdb_connection,
3066 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3068 tcparray->connections[tcparray->num].src = p->src;
3069 tcparray->connections[tcparray->num].dst = p->dst;
3072 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3073 ctdb_addr_to_str(&tcp.dst),
3074 ntohs(tcp.dst.ip.sin_port),
3077 if (tcp_update_needed) {
3078 vnn->tcp_update_needed = true;
3086 called by a daemon to inform us of a TCP connection that one of its
3087 clients managing that should tickled with an ACK when IP takeover is
/*
 * Remove one connection from the tickle array of the public address
 * it is addressed to (conn->dst).
 *
 * The vnn is found with find_public_ip_vnn() and the entry with
 * ctdb_tcp_find().  The entry is removed by overwriting it with the
 * last element of the array and decrementing num, so the order of the
 * remaining entries is not preserved.  When num reaches 0 the whole
 * array is freed and vnn->tcp_array is reset to NULL.  After a
 * removal vnn->tcp_update_needed is set so the changed list gets
 * propagated.
 *
 * An unknown public address, an empty array and an unknown connection
 * are each only logged (presumably followed by an early return).
 */
3090 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3092 struct ctdb_connection *tcpp;
3093 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3096 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3097 ctdb_addr_to_str(&conn->dst)));
3101 /* if the array is empty we cant remove it
3102 and we don't need to do anything
3104 if (vnn->tcp_array == NULL) {
3105 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3106 ctdb_addr_to_str(&conn->dst),
3107 ntohs(conn->dst.ip.sin_port)));
3112 /* See if we know this connection
3113 if we don't know this connection then we dont need to do anything
3115 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3117 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3118 ctdb_addr_to_str(&conn->dst),
3119 ntohs(conn->dst.ip.sin_port)));
3124 /* We need to remove this entry from the array.
3125 Instead of allocating a new array and copying data to it
3126 we cheat and just copy the last entry in the existing array
3127 to the entry that is to be removed and just shring the
3130 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3131 vnn->tcp_array->num--;
3133 /* If we deleted the last entry we also need to remove the entire array
3135 if (vnn->tcp_array->num == 0) {
3136 talloc_free(vnn->tcp_array);
3137 vnn->tcp_array = NULL;
3140 vnn->tcp_update_needed = true;
3142 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3143 ctdb_addr_to_str(&conn->src),
3144 ntohs(conn->src.ip.sin_port)));
3149 called by a daemon to inform us that a TCP connection that one of its
3150 clients used is no longer needed in the tickle database
/*
 * Control handler: drop one connection from the tickle database.
 *
 * indata.dptr is interpreted as a struct ctdb_connection and handed
 * to ctdb_remove_connection().  Nothing is removed when this node has
 * no public addresses configured (ctdb->vnn == NULL).
 */
3152 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3154 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3156 /* If we don't have public IPs, tickles are useless */
3157 if (ctdb->vnn == NULL) {
3161 ctdb_remove_connection(ctdb, conn);
3168 Called when another daemon starts - causes all tickles for all
3169 public addresses we are serving to be sent to the new node on the
3170 next check. This actually causes the next scheduled call to
3171 ctdb_update_tcp_tickles() to update all nodes. This is simple and
3172 doesn't require careful error handling.
/*
 * Control handler run when node pnn announces that it has started.
 *
 * Marks every vnn on ctdb->vnn with tcp_update_needed, so the
 * periodic ctdb_update_tcp_tickles() run will re-send the tickle
 * list of each address this node is hosting.
 */
3174 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3176 struct ctdb_vnn *vnn;
3178 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3179 (unsigned long) pnn));
3181 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3182 vnn->tcp_update_needed = true;
3190 called when a client structure goes away - hook to remove
3191 elements from the tcp_list in all daemons
/*
 * Drain client->tcp_list: every registered connection is unlinked
 * from the list and then removed from the tickle array of its public
 * address through ctdb_remove_connection().
 */
3193 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3195 while (client->tcp_list) {
3196 struct ctdb_tcp_list *tcp = client->tcp_list;
3197 DLIST_REMOVE(client->tcp_list, tcp);
3198 ctdb_remove_connection(client->ctdb, &tcp->connection);
/*
 * Release every public address this node may be holding.
 *
 * The disable_ip_failover tunable is checked first (presumably an
 * early return when it is 1).  Then, for each vnn:
 *  - if the address is not configured on the system
 *    (ctdb_sys_have_ip() is false) only the interface assignment is
 *    dropped;
 *  - if an update is already in flight for the vnn, a warning is
 *    logged instead of releasing;
 *  - otherwise the CTDB_EVENT_RELEASE_IP event script is run with the
 *    arguments "<iface> <address> <netmask bits>", clients of the
 *    address are killed with release_kill_clients() and the interface
 *    assignment is dropped.  update_in_flight is held true around
 *    these steps.
 *
 * The number of released addresses is logged at the end.
 */
3203 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3205 struct ctdb_vnn *vnn;
3208 if (ctdb->tunable.disable_ip_failover == 1) {
3212 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3213 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3214 ctdb_vnn_unassign_iface(ctdb, vnn);
3221 /* Don't allow multiple releases at once. Some code,
3222 * particularly ctdb_tickle_sentenced_connections() is
3224 if (vnn->update_in_flight) {
3225 DEBUG(DEBUG_WARNING,
3227 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3228 ctdb_addr_to_str(&vnn->public_address),
3229 vnn->public_netmask_bits,
3230 ctdb_vnn_iface_string(vnn)));
3233 vnn->update_in_flight = true;
3235 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3236 ctdb_addr_to_str(&vnn->public_address),
3237 vnn->public_netmask_bits,
3238 ctdb_vnn_iface_string(vnn)));
3240 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3241 ctdb_vnn_iface_string(vnn),
3242 ctdb_addr_to_str(&vnn->public_address),
3243 vnn->public_netmask_bits);
3244 release_kill_clients(ctdb, &vnn->public_address);
3245 ctdb_vnn_unassign_iface(ctdb, vnn);
3246 vnn->update_in_flight = false;
3250 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3255 get list of public IPs
/*
 * Control handler: return the public addresses configured on this
 * node as a struct ctdb_public_ip_list_old.
 *
 * When the request carries CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE only
 * addresses for which ctdb_vnn_available() is true are reported.
 * The reply is allocated as a talloc child of outdata and returned
 * through outdata->dptr / outdata->dsize.
 */
3257 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3258 struct ctdb_req_control_old *c, TDB_DATA *outdata)
3261 struct ctdb_public_ip_list_old *ips;
3262 struct ctdb_vnn *vnn;
3263 bool only_available = false;
3265 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3266 only_available = true;
3269 /* count how many public ip structures we have */
3271 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
/* Size the (zeroed) reply for the counted number of entries */
3275 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3276 num*sizeof(struct ctdb_public_ip);
3277 ips = talloc_zero_size(outdata, len);
3278 CTDB_NO_MEMORY(ctdb, ips);
/* Fill in the entries, filtering unavailable ones if requested */
3281 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3282 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3285 ips->ips[i].pnn = vnn->pnn;
3286 ips->ips[i].addr = vnn->public_address;
/* Report only the part of the buffer that was actually filled (i
 * entries), which can be smaller than what was allocated. */
3290 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3291 i*sizeof(struct ctdb_public_ip);
3293 outdata->dsize = len;
3294 outdata->dptr = (uint8_t *)ips;
/*
 * Control handler: return details about one public address as a
 * struct ctdb_public_ip_info_old.
 *
 * indata.dptr is interpreted as the ctdb_sock_addr to look up.  The
 * address is searched among the public addresses and, failing that,
 * compared against the 'single ip' vnn.  The reply holds the address,
 * the node currently hosting it, one ctdb_iface record per entry of
 * vnn->ifaces, and active_idx, the index of the interface the address
 * is currently assigned to (0xFFFFFFFF when none matches).
 *
 * The reply is allocated as a talloc child of outdata.
 */
3300 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3301 struct ctdb_req_control_old *c,
3306 ctdb_sock_addr *addr;
3307 struct ctdb_public_ip_info_old *info;
3308 struct ctdb_vnn *vnn;
3310 addr = (ctdb_sock_addr *)indata.dptr;
3312 vnn = find_public_ip_vnn(ctdb, addr);
3314 /* if it is not a public ip it could be our 'single ip' */
3315 if (ctdb->single_ip_vnn) {
3316 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3317 vnn = ctdb->single_ip_vnn;
3322 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3323 "'%s'not a public address\n",
3324 ctdb_addr_to_str(addr)));
3328 /* count how many public ip structures we have */
/* vnn->ifaces is walked until a NULL entry is found */
3330 for (;vnn->ifaces[num];) {
3334 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3335 num*sizeof(struct ctdb_iface);
3336 info = talloc_zero_size(outdata, len);
3337 CTDB_NO_MEMORY(ctdb, info);
3339 info->ip.addr = vnn->public_address;
3340 info->ip.pnn = vnn->pnn;
/* "no active interface" until a match is found in the loop below */
3341 info->active_idx = 0xFFFFFFFF;
3343 for (i=0; vnn->ifaces[i]; i++) {
3344 struct ctdb_interface *cur;
3346 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3348 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3352 if (vnn->iface == cur) {
3353 info->active_idx = i;
/* Copies at most size-1 bytes; the buffer was zero-filled by
 * talloc_zero_size(), so the name stays NUL-terminated. */
3355 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3356 info->ifaces[i].link_state = cur->link_up;
3357 info->ifaces[i].references = cur->references;
3360 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3361 i*sizeof(struct ctdb_iface);
3363 outdata->dsize = len;
3364 outdata->dptr = (uint8_t *)info;
3369 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3370 struct ctdb_req_control_old *c,
3374 struct ctdb_iface_list_old *ifaces;
3375 struct ctdb_interface *cur;
3377 /* count how many public ip structures we have */
3379 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3383 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3384 num*sizeof(struct ctdb_iface);
3385 ifaces = talloc_zero_size(outdata, len);
3386 CTDB_NO_MEMORY(ctdb, ifaces);
3389 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3390 strcpy(ifaces->ifaces[i].name, cur->name);
3391 ifaces->ifaces[i].link_state = cur->link_up;
3392 ifaces->ifaces[i].references = cur->references;
3396 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3397 i*sizeof(struct ctdb_iface);
3399 outdata->dsize = len;
3400 outdata->dptr = (uint8_t *)ifaces;
/*
 * Control handler: set the link state of a known interface.
 *
 * indata.dptr is interpreted as a struct ctdb_iface.  The request is
 * validated before use: the byte at name[CTDB_IFACE_SIZE] must be
 * NUL, link_state must be one of the values accepted by the switch,
 * and references must be 0.  The interface is then looked up by name
 * with ctdb_find_iface(); when the requested state differs from the
 * current one the change is logged and iface->link_up is updated.
 *
 * The change is logged at DEBUG_ERR when the interface was up before
 * (i.e. it is going down) and at DEBUG_NOTICE otherwise.
 *
 * NOTE(review): no check of indata.dsize is visible before the cast -
 * confirm the dispatcher validates the size.
 */
3405 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3406 struct ctdb_req_control_old *c,
3409 struct ctdb_iface *info;
3410 struct ctdb_interface *iface;
3411 bool link_up = false;
3413 info = (struct ctdb_iface *)indata.dptr;
3415 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3416 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3417 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3418 len, len, info->name));
3422 switch (info->link_state) {
3430 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3431 (unsigned int)info->link_state));
3435 if (info->references != 0) {
3436 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3437 (unsigned int)info->references));
3441 iface = ctdb_find_iface(ctdb, info->name);
3442 if (iface == NULL) {
3446 if (link_up == iface->link_up) {
3450 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3451 ("iface[%s] has changed it's link status %s => %s\n",
3453 iface->link_up?"up":"down",
3454 link_up?"up":"down"));
3456 iface->link_up = link_up;
3462 structure containing the listening socket and the list of tcp connections
3463 that the ctdb daemon is to kill
/*
 * State for killing TCP connections on one public address.  It is
 * allocated as a talloc child of the vnn by
 * ctdb_killtcp_add_connection() and hooked up as vnn->killtcp.
 */
3465 struct ctdb_kill_tcp {
/* the public address these connections belong to */
3466 struct ctdb_vnn *vnn;
3467 struct ctdb_context *ctdb;
/* fd event that feeds captured packets to capture_tcp_handler() */
3469 struct tevent_fd *fde;
/* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
3470 trbt_tree_t *connections;
3475 a tcp connection that is to be killed
/*
 * One connection that is to be reset: both endpoints of the
 * connection plus a back-pointer to the owning ctdb_kill_tcp.
 */
3477 struct ctdb_killtcp_con {
3478 ctdb_sock_addr src_addr;
3479 ctdb_sock_addr dst_addr;
3481 struct ctdb_kill_tcp *killtcp;
3484 /* this function is used to create a key to represent this socketpair
3485 in the killtcp tree.
3486 this key is used to insert and lookup matching socketpairs that are
3487 to be tickled and RST
/*
 * Build the KILLTCP_KEYLEN-word key that identifies a socket pair in
 * the killtcp tree.
 *
 * The key lives in a function-local static buffer that is zeroed and
 * rebuilt on every call, so a key is only valid until the next call
 * and must be consumed immediately.
 *
 * IPv4: words 0-3 are dst address, src address, dst port, src port
 *       (the remaining words stay zero).
 * IPv6: words 0-7 interleave the four 32-bit words of the dst and src
 *       addresses, highest index first; words 8-9 are the ports.
 * Addresses and ports are stored exactly as found in the sockaddr;
 * no byte-order conversion is applied.
 *
 * Mismatching or unknown address families are only logged here.
 */
3489 #define KILLTCP_KEYLEN 10
3490 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3492 static uint32_t key[KILLTCP_KEYLEN];
3494 bzero(key, sizeof(key));
3496 if (src->sa.sa_family != dst->sa.sa_family) {
3497 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3501 switch (src->sa.sa_family) {
3503 key[0] = dst->ip.sin_addr.s_addr;
3504 key[1] = src->ip.sin_addr.s_addr;
3505 key[2] = dst->ip.sin_port;
3506 key[3] = src->ip.sin_port;
3509 uint32_t *dst6_addr32 =
3510 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3511 uint32_t *src6_addr32 =
3512 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3513 key[0] = dst6_addr32[3];
3514 key[1] = src6_addr32[3];
3515 key[2] = dst6_addr32[2];
3516 key[3] = src6_addr32[2];
3517 key[4] = dst6_addr32[1];
3518 key[5] = src6_addr32[1];
3519 key[6] = dst6_addr32[0];
3520 key[7] = src6_addr32[0];
3521 key[8] = dst->ip6.sin6_port;
3522 key[9] = src->ip6.sin6_port;
3526 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3534 called when we get a read event on the raw socket
/*
 * tevent fd handler for the capture socket of a ctdb_kill_tcp.
 *
 * Reads one packet with ctdb_sys_read_tcp_packet(), which yields the
 * packet's endpoints and sequence numbers, and looks the endpoints up
 * in killtcp->connections using killtcp_key(&src, &dst).  If the
 * packet belongs to a connection that is being killed, a TCP packet
 * built from the captured ack_seq/seq is sent with
 * ctdb_sys_send_tcp() to reset the connection (the final argument 1
 * presumably requests the RST flag - confirm against
 * ctdb_sys_send_tcp()).
 *
 * Events without TEVENT_FD_READ, unreadable packets and packets of
 * unrelated connections are ignored.
 */
3536 static void capture_tcp_handler(struct tevent_context *ev,
3537 struct tevent_fd *fde,
3538 uint16_t flags, void *private_data)
3540 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3541 struct ctdb_killtcp_con *con;
3542 ctdb_sock_addr src, dst;
3543 uint32_t ack_seq, seq;
3545 if (!(flags & TEVENT_FD_READ)) {
3549 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3550 killtcp->private_data,
3552 &ack_seq, &seq) != 0) {
3553 /* probably a non-tcp ACK packet */
3557 /* check if we have this guy in our list of connections
3560 con = trbt_lookuparray32(killtcp->connections,
3561 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3563 /* no this was some other packet we can just ignore */
3567 /* This one has been tickled !
3568 now reset him and remove him from the list.
3570 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3571 ntohs(con->dst_addr.ip.sin_port),
3572 ctdb_addr_to_str(&con->src_addr),
3573 ntohs(con->src_addr.ip.sin_port)));
3575 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3580 /* when traversing the list of all tcp connections to send tickle acks to
3581 (so that we can capture the ack coming back and kill the connection
3583 this callback is called for each connection we are currently trying to kill
/*
 * Tree-traverse callback, run once per connection that is still
 * waiting to be reset (see ctdb_tickle_sentenced_connections()).
 *
 * param is a talloc context collecting connections that have been
 * given up on; data is the struct ctdb_killtcp_con of the current
 * tree node.  A connection that has already been tried con->count >= 5
 * times is re-parented onto param so that the caller can free it once
 * the traverse is over; any other connection is tickled again.
 */
3585 static int tickle_connection_traverse(void *param, void *data)
3587 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3589 /* have tried too many times, just give up */
3590 if (con->count >= 5) {
3591 /* can't delete in traverse: reparent to delete_cons */
3592 talloc_steal(param, con);
3596 /* otherwise, try tickling it again */
3599 (ctdb_sock_addr *)&con->dst_addr,
3600 (ctdb_sock_addr *)&con->src_addr,
3607 called every second until all sentenced connections have been reset
/*
 * Timer handler that keeps tickling the connections queued for
 * killing on one public address.
 *
 * Every connection in killtcp->connections is visited with
 * tickle_connection_traverse().  Connections that have exhausted
 * their retries are collected on the temporary talloc context
 * delete_cons and freed only after the traverse has finished, because
 * tree nodes must not be deleted while traversing.  When the tree has
 * become empty the whole killtcp structure is freed; otherwise the
 * timer re-arms itself to run again one second later.
 */
3609 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3610 struct tevent_timer *te,
3611 struct timeval t, void *private_data)
3613 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3614 void *delete_cons = talloc_new(NULL);
3616 /* loop over all connections sending tickle ACKs */
3617 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3619 /* now we've finished traverse, it's safe to do deletion. */
3620 talloc_free(delete_cons);
3622 /* If there are no more connections to kill we can remove the
3623 entire killtcp structure
3625 if ( (killtcp->connections == NULL) ||
3626 (killtcp->connections->root == NULL) ) {
3627 talloc_free(killtcp);
3631 /* try tickling them again in a seconds time
3633 tevent_add_timer(killtcp->ctdb->ev, killtcp,
3634 timeval_current_ofs(1, 0),
3635 ctdb_tickle_sentenced_connections, killtcp);
3639 destroy the killtcp structure
/*
 * talloc destructor for a struct ctdb_kill_tcp (installed in
 * ctdb_killtcp_add_connection()).
 *
 * Clears the vnn's back-reference (vnn->killtcp) so it does not
 * dangle.  The vnn pointer is only trusted after it has been found
 * again on ctdb->vnn, and the reference is only cleared when it still
 * points at this very structure.
 */
3641 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3643 struct ctdb_vnn *tmpvnn;
3645 /* verify that this vnn is still active */
3646 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3647 if (tmpvnn == killtcp->vnn) {
3652 if (tmpvnn == NULL) {
3656 if (killtcp->vnn->killtcp != killtcp) {
3660 killtcp->vnn->killtcp = NULL;
3666 /* nothing fancy here, just unconditionally replace any existing
3667 connection structure with the new one.
3669 don't even free the old one if it did exist, that one is talloc_stolen
3670 by the same node in the tree anyway and will be deleted when the new data
/*
 * Insert callback handed to trbt_insertarray32_callback() when
 * ctdb_killtcp_add_connection() stores a connection in
 * killtcp->connections; the new connection is passed in as parm.
 */
3673 static void *add_killtcp_callback(void *parm, void *data)
3679 add a tcp socket to the list of connections we want to RST
/*
 * Queue a TCP connection for being reset.
 *
 * Both endpoints are canonicalised into local copies.  The vnn is
 * looked up by destination address, then by source address, and
 * finally the destination is compared against the 'single ip' vnn;
 * if none matches the request is rejected with an error message.
 *
 * The first request for a vnn creates its ctdb_kill_tcp state (tree
 * of connections, destructor, capture_fd initialised to -1).  The
 * connection is stored in the tree under killtcp_key(), replacing any
 * previous entry with the same key.  When needed the capture socket
 * is opened on the vnn's interface, and on first use an fd event
 * (capture_tcp_handler) plus a one-second timer
 * (ctdb_tickle_sentenced_connections) are set up.
 *
 * The trailing talloc_free(vnn->killtcp) lines look like the failure
 * path that tears the state down again.
 *
 * NOTE(review): the results of trbt_create() and tevent_add_fd() are
 * used with no NULL check visible here - confirm.
 */
3681 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3685 ctdb_sock_addr src, dst;
3686 struct ctdb_kill_tcp *killtcp;
3687 struct ctdb_killtcp_con *con;
3688 struct ctdb_vnn *vnn;
3690 ctdb_canonicalize_ip(s, &src);
3691 ctdb_canonicalize_ip(d, &dst);
3693 vnn = find_public_ip_vnn(ctdb, &dst);
3695 vnn = find_public_ip_vnn(ctdb, &src);
3698 /* if it is not a public ip it could be our 'single ip' */
3699 if (ctdb->single_ip_vnn) {
3700 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3701 vnn = ctdb->single_ip_vnn;
3706 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3710 killtcp = vnn->killtcp;
3712 /* If this is the first connection to kill we must allocate
3715 if (killtcp == NULL) {
3716 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3717 CTDB_NO_MEMORY(ctdb, killtcp);
3720 killtcp->ctdb = ctdb;
3721 killtcp->capture_fd = -1;
3722 killtcp->connections = trbt_create(killtcp, 0);
3724 vnn->killtcp = killtcp;
3725 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3730 /* create a structure that describes this connection we want to
3731 RST and store it in killtcp->connections
3733 con = talloc(killtcp, struct ctdb_killtcp_con);
3734 CTDB_NO_MEMORY(ctdb, con);
3735 con->src_addr = src;
3736 con->dst_addr = dst;
3738 con->killtcp = killtcp;
3741 trbt_insertarray32_callback(killtcp->connections,
3742 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3743 add_killtcp_callback, con);
3746 If we don't have a socket to listen on yet we must create it
3748 if (killtcp->capture_fd == -1) {
3749 const char *iface = ctdb_vnn_iface_string(vnn);
3750 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3751 if (killtcp->capture_fd == -1) {
3752 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3753 "socket on iface '%s' for killtcp (%s)\n",
3754 iface, strerror(errno)));
3760 if (killtcp->fde == NULL) {
3761 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3762 killtcp->capture_fd,
3764 capture_tcp_handler, killtcp);
3765 tevent_fd_set_auto_close(killtcp->fde);
3767 /* We also need to set up some events to tickle all these connections
3768 until they are all reset
3770 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3771 ctdb_tickle_sentenced_connections, killtcp);
3774 /* tickle him once now */
3783 talloc_free(vnn->killtcp);
3784 vnn->killtcp = NULL;
3789 kill a TCP connection.
/*
 * Control handler: interpret indata.dptr as a struct ctdb_connection
 * and queue that connection for being reset.  The return value is
 * that of ctdb_killtcp_add_connection().
 *
 * NOTE(review): no check of indata.dsize is visible before the cast -
 * confirm the dispatcher validates the size.
 */
3791 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3793 struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3795 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3799 called by a daemon to inform us of the entire list of TCP tickles for
3800 a particular public address.
3801 this control should only be sent by the node that is currently serving
3802 that public address.
/*
 * Control handler: replace the complete tickle list of one public
 * address with the list received in indata
 * (struct ctdb_tickle_list_old).
 *
 * The blob is validated in two steps: it must be large enough to hold
 * the fixed header, and then large enough for list->num connections.
 * The second check uses '<', so a blob that is larger than required
 * is accepted.  The previous array of the vnn is freed and a new one,
 * holding a copy of the received connections, is allocated as a
 * talloc child of the vnn and installed.
 *
 * An address that is not a public address of this node is only
 * logged at DEBUG_INFO level.
 */
3804 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3806 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3807 struct ctdb_tcp_array *tcparray;
3808 struct ctdb_vnn *vnn;
3810 /* We must at least have tickles.num or else we cant verify the size
3811 of the received data blob
3813 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3814 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3818 /* verify that the size of data matches what we expect */
3819 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3820 + sizeof(struct ctdb_connection) * list->num) {
3821 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3825 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3826 ctdb_addr_to_str(&list->addr)));
3828 vnn = find_public_ip_vnn(ctdb, &list->addr);
3830 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3831 ctdb_addr_to_str(&list->addr)));
3836 /* remove any old ticklelist we might have */
3837 talloc_free(vnn->tcp_array);
3838 vnn->tcp_array = NULL;
3840 tcparray = talloc(vnn, struct ctdb_tcp_array);
3841 CTDB_NO_MEMORY(ctdb, tcparray);
3843 tcparray->num = list->num;
3845 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3846 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3848 memcpy(tcparray->connections, &list->connections[0],
3849 sizeof(struct ctdb_connection)*tcparray->num);
3851 /* We now have a new fresh tickle list array for this vnn */
3852 vnn->tcp_array = tcparray;
3858 called to return the full list of tickles for the public address associated
3859 with the provided vnn
/*
 * Control handler: return the tickle list of one public address.
 *
 * indata.dptr is interpreted as the ctdb_sock_addr of the address.
 * The reply (struct ctdb_tickle_list_old) is allocated as a talloc
 * child of outdata, sized for num connections, and the connections of
 * the vnn's tickle array are copied into it.
 *
 * An address that is not a public address of this node is logged as
 * an error.
 */
3861 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3863 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3864 struct ctdb_tickle_list_old *list;
3865 struct ctdb_tcp_array *tcparray;
3867 struct ctdb_vnn *vnn;
3869 vnn = find_public_ip_vnn(ctdb, addr);
3871 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3872 ctdb_addr_to_str(addr)));
3877 tcparray = vnn->tcp_array;
3879 num = tcparray->num;
3884 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3885 + sizeof(struct ctdb_connection) * num;
3887 outdata->dptr = talloc_size(outdata, outdata->dsize);
3888 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3889 list = (struct ctdb_tickle_list_old *)outdata->dptr;
3894 memcpy(&list->connections[0], tcparray->connections,
3895 sizeof(struct ctdb_connection) * num);
3903 set the list of all tcp tickles for a public address
/*
 * Broadcast the complete tickle list of one public address to all
 * nodes.
 *
 * A struct ctdb_tickle_list_old holding num connections from tcparray
 * is built in a temporary buffer (talloc child of ctdb) and sent as
 * CTDB_CONTROL_SET_TCP_TICKLE_LIST to CTDB_BROADCAST_ALL with
 * CTDB_CTRL_FLAG_NOREPLY, i.e. without waiting for replies.  The
 * buffer is freed again before returning.
 */
3905 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3906 ctdb_sock_addr *addr,
3907 struct ctdb_tcp_array *tcparray)
3911 struct ctdb_tickle_list_old *list;
3914 num = tcparray->num;
3919 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3920 sizeof(struct ctdb_connection) * num;
3921 data.dptr = talloc_size(ctdb, data.dsize);
3922 CTDB_NO_MEMORY(ctdb, data.dptr);
3924 list = (struct ctdb_tickle_list_old *)data.dptr;
3928 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3931 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3932 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3933 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3935 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3939 talloc_free(data.dptr);
3946 perform tickle updates if required
/*
 * Periodic timer handler that pushes changed tickle lists to the
 * other nodes.
 *
 * For every vnn that is hosted by this node (ctdb->pnn == vnn->pnn)
 * and flagged tcp_update_needed, the tickle list is sent with
 * ctdb_send_set_tcp_tickles_for_ip().  The flag is cleared on the
 * success path only, so a failed send is retried on the next run.
 *
 * The handler re-arms itself on ctdb->tickle_update_context using the
 * tickle_update_interval tunable (seconds).
 */
3948 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3949 struct tevent_timer *te,
3950 struct timeval t, void *private_data)
3952 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3954 struct ctdb_vnn *vnn;
3956 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3957 /* we only send out updates for public addresses that
3960 if (ctdb->pnn != vnn->pnn) {
3963 /* We only send out the updates if we need to */
3964 if (!vnn->tcp_update_needed) {
3967 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3968 &vnn->public_address,
3971 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3972 ctdb_addr_to_str(&vnn->public_address)));
3975 ("Sent tickle update for public address %s\n",
3976 ctdb_addr_to_str(&vnn->public_address)));
3977 vnn->tcp_update_needed = false;
3981 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3982 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3983 ctdb_update_tcp_tickles, ctdb);
3987 start periodic update of tcp tickles
/*
 * Start the periodic tickle updates: creates
 * ctdb->tickle_update_context as the talloc parent of the timer and
 * schedules the first ctdb_update_tcp_tickles() run after
 * tickle_update_interval seconds.
 */
3989 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3991 ctdb->tickle_update_context = talloc_new(ctdb);
3993 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3994 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3995 ctdb_update_tcp_tickles, ctdb);
/*
 * State of one series of gratuitous ARPs requested through
 * ctdb_control_send_gratious_arp(); it is passed to
 * send_gratious_arp() as the timer's private data.
 */
4001 struct control_gratious_arp {
4002 struct ctdb_context *ctdb;
/* address the ARPs are sent for */
4003 ctdb_sock_addr addr;
4009 send a control_gratuitous arp
/*
 * Timer handler that sends one gratuitous ARP for arp->addr on
 * arp->iface with ctdb_sys_send_arp().
 *
 * A send failure is logged together with strerror(errno).  The
 * handler compares arp->count against CTDB_ARP_REPEAT to decide when
 * the series is complete; until then it re-arms itself to run again
 * after CTDB_ARP_INTERVAL seconds.  The timer is a talloc child of
 * arp.
 */
4011 static void send_gratious_arp(struct tevent_context *ev,
4012 struct tevent_timer *te,
4013 struct timeval t, void *private_data)
4016 struct control_gratious_arp *arp = talloc_get_type(private_data,
4017 struct control_gratious_arp);
4019 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4021 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4022 arp->iface, strerror(errno)));
4027 if (arp->count == CTDB_ARP_REPEAT) {
4032 tevent_add_timer(arp->ctdb->ev, arp,
4033 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4034 send_gratious_arp, arp);
4041 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4043 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4044 struct control_gratious_arp *arp;
4046 /* verify the size of indata */
4047 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4048 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
4049 (unsigned)indata.dsize,
4050 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4054 ( offsetof(struct ctdb_addr_info_old, iface)
4055 + gratious_arp->len ) ){
4057 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4058 "but should be %u bytes\n",
4059 (unsigned)indata.dsize,
4060 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4065 arp = talloc(ctdb, struct control_gratious_arp);
4066 CTDB_NO_MEMORY(ctdb, arp);
4069 arp->addr = gratious_arp->addr;
4070 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4071 CTDB_NO_MEMORY(ctdb, arp->iface);
4074 tevent_add_timer(arp->ctdb->ev, arp,
4075 timeval_zero(), send_gratious_arp, arp);
/*
 * Control handler: add a public address to this node.
 *
 * indata carries a struct ctdb_addr_info_old (address, netmask and an
 * interface name of pub->len bytes).  After the size of the blob has
 * been verified, the address is registered through
 * ctdb_add_public_address(); a failure there is logged.
 */
4080 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4082 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4085 /* verify the size of indata */
4086 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4087 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4091 ( offsetof(struct ctdb_addr_info_old, iface)
4094 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4095 "but should be %u bytes\n",
4096 (unsigned)indata.dsize,
4097 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4101 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4103 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4106 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
/*
 * State carried from ctdb_control_del_public_address() to
 * delete_ip_callback() while the address is being released.
 */
4113 struct delete_ip_callback_state {
/* the pending request; answered by delete_ip_callback() */
4114 struct ctdb_req_control_old *c;
4118 called when releaseip event finishes for del_public_address
/*
 * Completion callback for the CTDB_CONTROL_RELEASE_IP issued by
 * ctdb_control_del_public_address().
 *
 * Answers the deferred delete request (state->c) with the status and
 * error message of the release, then frees the callback state.
 */
4120 static void delete_ip_callback(struct ctdb_context *ctdb,
4121 int32_t status, TDB_DATA data,
4122 const char *errormsg,
4125 struct delete_ip_callback_state *state =
4126 talloc_get_type(private_data, struct delete_ip_callback_state);
4128 /* If release failed then fail. */
4129 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4130 talloc_free(private_data);
/*
 * Control handler that deletes a public IP address.
 *
 * indata is interpreted as a struct ctdb_addr_info_old and is size-checked
 * against the fixed part of that structure and against fixed part + pub->len.
 * The vnn list is then searched for pub->addr:
 *  - when the matching vnn is assigned to this node (vnn->pnn == ctdb->pnn)
 *    it is marked delete_pending and a CTDB_CONTROL_RELEASE_IP control is sent
 *    with a struct ctdb_public_ip payload; c is moved under the callback state
 *    and *async_reply is set to true;
 *  - otherwise do_delete_ip() is called on the vnn.
 * The final DEBUG reports an address that is not a known public IP.
 */
4133 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4134 struct ctdb_req_control_old *c,
4135 TDB_DATA indata, bool *async_reply)
4137 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4138 struct ctdb_vnn *vnn;
4140 /* verify the size of indata */
4141 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4142 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4146 ( offsetof(struct ctdb_addr_info_old, iface)
4149 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4150 "but should be %u bytes\n",
4151 (unsigned)indata.dsize,
4152 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4156 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4158 /* walk over all public addresses until we find a match */
4159 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4160 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4161 if (vnn->pnn == ctdb->pnn) {
4162 struct delete_ip_callback_state *state;
4163 struct ctdb_public_ip *ip;
/* The address is assigned to this node: mark the vnn and request a release. */
4167 vnn->delete_pending = true;
4169 state = talloc(ctdb,
4170 struct delete_ip_callback_state);
4171 CTDB_NO_MEMORY(ctdb, state);
/* The control payload is allocated as a child of state. */
4174 ip = talloc(state, struct ctdb_public_ip);
4177 (__location__ " Out of memory\n"));
4182 ip->addr = pub->addr;
4184 data.dsize = sizeof(struct ctdb_public_ip);
4185 data.dptr = (unsigned char *)ip;
4187 ret = ctdb_daemon_send_control(ctdb,
4190 CTDB_CONTROL_RELEASE_IP,
4197 (__location__ "Unable to send "
4198 "CTDB_CONTROL_RELEASE_IP\n"));
/* Move c under state and flag that the reply will be sent asynchronously. */
4203 state->c = talloc_steal(state, c);
4204 *async_reply = true;
4206 /* This IP is not hosted on the
4207 * current node so just delete it
4209 do_delete_ip(ctdb, vnn);
4216 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4217 ctdb_addr_to_str(&pub->addr)));
// State handed to ctdb_ipreallocated_callback(): c is the control request
// that the callback replies to.
4222 struct ipreallocated_callback_state {
4223 struct ctdb_req_control_old *c;
// Completion callback for the "ipreallocated" event script.
// p is a struct ipreallocated_callback_state.  The failure message reports the
// script status; a status of -ETIME additionally calls ctdb_ban_self().  The
// stored control request is then answered with the status.
4226 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4227 int status, void *p)
4229 struct ipreallocated_callback_state *state =
4230 talloc_get_type(p, struct ipreallocated_callback_state);
4234 (" \"ipreallocated\" event script failed (status %d)\n",
4236 if (status == -ETIME) {
4237 ctdb_ban_self(ctdb);
4241 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4245 /* A control to run the ipreallocated event */
/*
 * Allocates callback state on ctdb and starts the CTDB_EVENT_IPREALLOCATED
 * event script with ctdb_ipreallocated_callback() as its completion callback.
 * A failure to start the script is logged.  Otherwise c is moved under the
 * state and *async_reply is set to true, so the reply is sent from the
 * callback.
 */
4246 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4247 struct ctdb_req_control_old *c,
4251 struct ipreallocated_callback_state *state;
4253 state = talloc(ctdb, struct ipreallocated_callback_state);
4254 CTDB_NO_MEMORY(ctdb, state);
4256 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4258 ret = ctdb_event_script_callback(ctdb, state,
4259 ctdb_ipreallocated_callback, state,
4260 CTDB_EVENT_IPREALLOCATED,
4264 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4269 /* tell the control that we will be reply asynchronously */
4270 state->c = talloc_steal(state, c);
4271 *async_reply = true;
4277 /* This function is called from the recovery daemon to verify that a remote
4278 node has the expected ip allocation.
4279 This is verified against ctdb->ip_tree
// ips is the public IP list being checked; pnn (used in the log messages)
// identifies the node that reported it.
// Each address in ips is looked up in ctdb->ip_tree via ip_key().  An address
// missing from the tree is logged as new or unknown.  Entries where either
// side has pnn == -1 are tested separately, ahead of the comparison that logs
// an inconsistent allocation when the two pnn values differ.
4281 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4282 struct ctdb_public_ip_list_old *ips,
4285 struct public_ip_list *tmp_ip;
4288 if (ctdb->ip_tree == NULL) {
4289 /* don't know the expected allocation yet, assume remote node
4298 for (i=0; i<ips->num; i++) {
4299 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4300 if (tmp_ip == NULL) {
4301 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4305 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4309 if (tmp_ip->pnn != ips->ips[i].pnn) {
4311 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4313 ctdb_addr_to_str(&ips->ips[i].addr),
4314 ips->ips[i].pnn, tmp_ip->pnn));
// Records in ctdb->ip_tree that ip->addr is now assigned to node ip->pnn.
// The DisableIPFailover tunable is checked first.  A missing tree, or an
// address that has no record in the tree, is logged as an error.
4322 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4324 struct public_ip_list *tmp_ip;
4326 /* IP tree is never built if DisableIPFailover is set */
4327 if (ctdb->tunable.disable_ip_failover != 0) {
4331 if (ctdb->ip_tree == NULL) {
4332 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4336 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4337 if (tmp_ip == NULL) {
4338 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
// The old owner is read for the log message before it is overwritten.
4342 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4343 tmp_ip->pnn = ip->pnn;
/* Discards the IP assignment tree: TALLOC_FREE() frees ctdb->ip_tree and resets the pointer. */
4348 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4350 TALLOC_FREE(ctdb->ip_tree);
/*
 * Handle for one public IP reload run by a forked child.
 *  ctdb - daemon context the handle belongs to
 *  c    - control request answered from ctdb_reloadips_destructor()
 *  fde  - fd event watching the read end of the pipe to the child
 */
4353 struct ctdb_reloadips_handle {
4354 struct ctdb_context *ctdb;
4355 struct ctdb_req_control_old *c;
4359 struct tevent_fd *fde;
/*
 * talloc destructor for a reloadips handle.  Clears ctdb->reload_ips when it
 * still points at this handle, answers the pending control request with
 * h->status, and sends SIGKILL to the child through ctdb_kill().
 */
4362 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4364 if (h == h->ctdb->reload_ips) {
4365 h->ctdb->reload_ips = NULL;
4368 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4371 ctdb_kill(h->ctdb, h->child, SIGKILL);
/*
 * Timer callback armed by ctdb_control_reload_public_ips() with a 120 second
 * offset.  private_data is the struct ctdb_reloadips_handle for the reload.
 */
4375 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4376 struct tevent_timer *te,
4377 struct timeval t, void *private_data)
4379 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
/*
 * fd event callback for the read end of the pipe to the reloadips child.
 * private_data is the struct ctdb_reloadips_handle.  One byte is read from
 * the pipe; a short read or a non-zero byte is logged as a child error.
 */
4384 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4385 struct tevent_fd *fde,
4386 uint16_t flags, void *private_data)
4388 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4393 ret = sys_read(h->fd[0], &res, 1);
4394 if (ret < 1 || res != 0) {
4395 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
/*
 * Work done by the reloadips child process.
 *
 * Fetches the public IPs currently known to the local node, re-reads the
 * public addresses file with ctdb_set_public_addresses() (presumably
 * repopulating ctdb->vnn - confirm), and then compares the two sets:
 *  - an IP that the node reported and that is no longer configured gets a
 *    CTDB_CONTROL_DEL_PUBLIC_IP control;
 *  - a vnn entry that matches none of the node's IPs gets a
 *    CTDB_CONTROL_ADD_PUBLIC_IP control carrying its address, netmask bits
 *    and comma-separated interface list.
 * The controls are collected in async_data and waited for together.  All
 * temporary allocations hang off mem_ctx, which is freed before returning.
 */
4403 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4405 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4406 struct ctdb_public_ip_list_old *ips;
4407 struct ctdb_vnn *vnn;
4408 struct client_async_data *async_data;
4409 struct timeval timeout;
4411 struct ctdb_client_control_state *state;
4415 CTDB_NO_MEMORY(ctdb, mem_ctx);
4417 /* Read IPs from local node */
4418 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4419 CTDB_CURRENT_NODE, mem_ctx, &ips);
4422 ("Unable to fetch public IPs from local node\n"));
4423 talloc_free(mem_ctx);
4427 /* Read IPs file - this is safe since this is a child process */
4429 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4430 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4431 talloc_free(mem_ctx);
4435 async_data = talloc_zero(mem_ctx, struct client_async_data);
4436 CTDB_NO_MEMORY(ctdb, async_data);
4438 /* Compare IPs between node and file for IPs to be deleted */
4439 for (i = 0; i < ips->num; i++) {
4441 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4442 if (ctdb_same_ip(&vnn->public_address,
4443 &ips->ips[i].addr)) {
4444 /* IP is still in file */
4450 /* Delete IP ips->ips[i] */
4451 struct ctdb_addr_info_old *pub;
4454 ("IP %s no longer configured, deleting it\n",
4455 ctdb_addr_to_str(&ips->ips[i].addr)));
/* The delete request is zero-initialised; only the address is copied in here. */
4457 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4458 CTDB_NO_MEMORY(ctdb, pub);
4460 pub->addr = ips->ips[i].addr;
4464 timeout = TAKEOVER_TIMEOUT();
4466 data.dsize = offsetof(struct ctdb_addr_info_old,
4468 data.dptr = (uint8_t *)pub;
4470 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4471 CTDB_CONTROL_DEL_PUBLIC_IP,
4472 0, data, async_data,
4474 if (state == NULL) {
4477 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4481 ctdb_client_async_add(async_data, state);
4485 /* Compare IPs between node and file for IPs to be added */
4487 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4488 for (i = 0; i < ips->num; i++) {
4489 if (ctdb_same_ip(&vnn->public_address,
4490 &ips->ips[i].addr)) {
4491 /* IP already on node */
/* i reaching ips->num means the scan above found no match for this vnn. */
4495 if (i == ips->num) {
4496 /* Add IP ips->ips[i] */
4497 struct ctdb_addr_info_old *pub;
4498 const char *ifaces = NULL;
4503 ("New IP %s configured, adding it\n",
4504 ctdb_addr_to_str(&vnn->public_address)));
/*
 * Broadcast this node's pnn to connected nodes with
 * CTDB_SRVID_REBALANCE_NODE; a send failure is only logged as a warning.
 */
4506 uint32_t pnn = ctdb_get_pnn(ctdb);
4508 data.dsize = sizeof(pnn);
4509 data.dptr = (uint8_t *)&pnn;
4511 ret = ctdb_client_send_message(
4513 CTDB_BROADCAST_CONNECTED,
4514 CTDB_SRVID_REBALANCE_NODE,
4517 DEBUG(DEBUG_WARNING,
4518 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
/*
 * Join the vnn's interface names into one comma-separated string.
 * NOTE(review): each talloc_asprintf() result is parented to vnn rather than
 * mem_ctx, and no NULL check is visible before strlen() - confirm this is
 * acceptable in the child process.
 */
4524 ifaces = vnn->ifaces[0];
4526 while (vnn->ifaces[iface] != NULL) {
4527 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4528 vnn->ifaces[iface]);
/*
 * The request is the fixed part of ctdb_addr_info_old followed by the
 * interface list; len includes the terminating NUL.
 */
4532 len = strlen(ifaces) + 1;
4533 pub = talloc_zero_size(mem_ctx,
4534 offsetof(struct ctdb_addr_info_old, iface) + len);
4535 CTDB_NO_MEMORY(ctdb, pub);
4537 pub->addr = vnn->public_address;
4538 pub->mask = vnn->public_netmask_bits;
4540 memcpy(&pub->iface[0], ifaces, pub->len);
4542 timeout = TAKEOVER_TIMEOUT();
4544 data.dsize = offsetof(struct ctdb_addr_info_old,
4546 data.dptr = (uint8_t *)pub;
4548 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4549 CTDB_CONTROL_ADD_PUBLIC_IP,
4550 0, data, async_data,
4552 if (state == NULL) {
4555 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4559 ctdb_client_async_add(async_data, state);
/* Wait for every queued add/delete control; any failure is logged. */
4563 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4564 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4568 talloc_free(mem_ctx);
4572 talloc_free(mem_ctx);
4576 /* This control is sent to force the node to re-read the public addresses file
4577 and drop any addresses we should no longer host, and add new addresses
4578 that we are now able to host
// Control handler that reloads the public addresses in a forked child.
//
// A handle left in ctdb->reload_ips by an earlier reload is freed first.
// A new handle and a pipe are then created and a child is forked.
//
// Child: switches to client mode, runs ctdb_reloadips_child(), writes a
// one-byte result to the write end of the pipe, and then polls the parent
// pid with signal 0 so that it can stop once the parent is gone.
//
// Parent: moves c under the handle, installs ctdb_reloadips_destructor(),
// watches the read end of the pipe with ctdb_reloadips_child_handler(), arms
// a 120 second timer for ctdb_reloadips_timeout_event(), and sets
// *async_reply to true.
4580 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4582 struct ctdb_reloadips_handle *h;
4583 pid_t parent = getpid();
4585 if (ctdb->reload_ips != NULL) {
4586 talloc_free(ctdb->reload_ips);
4587 ctdb->reload_ips = NULL;
4590 h = talloc(ctdb, struct ctdb_reloadips_handle);
4591 CTDB_NO_MEMORY(ctdb, h);
// NOTE(review): the message below names ctdb_freeze_lock, but this pipe is
// for the reloadips child - it looks like a copy/paste leftover.
4596 if (pipe(h->fd) == -1) {
4597 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4602 h->child = ctdb_fork(ctdb);
4603 if (h->child == (pid_t)-1) {
4604 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
// Child side of the fork.
4612 if (h->child == 0) {
4613 signed char res = 0;
4616 debug_extra = talloc_asprintf(NULL, "reloadips:");
4618 prctl_set_comment("ctdb_reloadips");
4619 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4620 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4623 res = ctdb_reloadips_child(ctdb);
4625 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
// Report the one-byte result to the parent through the pipe.
4629 sys_write(h->fd[1], &res, 1);
4630 /* make sure we die when our parent dies */
4631 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
// Parent side: c now belongs to the handle and is answered from its destructor.
4637 h->c = talloc_steal(h, c);
4640 set_close_on_exec(h->fd[0]);
4642 talloc_set_destructor(h, ctdb_reloadips_destructor);
4645 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4646 ctdb_reloadips_child_handler, (void *)h);
4647 tevent_fd_set_auto_close(h->fde);
4649 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4650 ctdb_reloadips_timeout_event, h);
4652 /* we reply later */
4653 *async_reply = true;