4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
45 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
47 #define CTDB_ARP_INTERVAL 1
48 #define CTDB_ARP_REPEAT 3
50 /* Flags used in IP allocation algorithms. */
/* Identifies the IP allocation algorithm; stored in struct ipalloc_state. */
56 enum ipalloc_algorithm {
57 IPALLOC_DETERMINISTIC,
58 IPALLOC_NONDETERMINISTIC,
/*
 * State used by the IP allocation code.  The two public IP arrays are
 * indexed by node number.
 */
62 struct ipalloc_state {
65 /* Arrays with data for each node */
66 struct ctdb_public_ip_list_old **known_public_ips;
67 struct ctdb_public_ip_list_old **available_public_ips;
/* Algorithm selected for the allocation run */
69 enum ipalloc_algorithm algorithm;
70 uint32_t no_ip_failback;
/*
 * A local network interface.  Instances are linked through prev/next on
 * the doubly linked list ctdb->ifaces.
 */
73 struct ctdb_interface {
74 struct ctdb_interface *prev, *next;
/* Return the name of the interface currently assigned to the given VNN. */
80 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
83 return vnn->iface->name;
/*
 * Register the named interface on ctdb->ifaces.
 * The list is scanned for an entry with the same name, then a new zeroed
 * struct ctdb_interface is allocated on ctdb and linked into the list.
 */
89 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
91 struct ctdb_interface *i;
93 /* Verify that we don't have an entry for this interface yet */
94 for (i=ctdb->ifaces;i;i=i->next) {
95 if (strcmp(i->name, iface) == 0) {
100 /* create a new structure for this interface */
101 i = talloc_zero(ctdb, struct ctdb_interface);
102 CTDB_NO_MEMORY_FATAL(ctdb, i);
/* the name copy is allocated with the new interface as talloc parent */
103 i->name = talloc_strdup(i, iface);
104 CTDB_NO_MEMORY(ctdb, i->name);
108 DLIST_ADD(ctdb->ifaces, i);
/* Scan the NULL-terminated vnn->ifaces array for an entry equal to name. */
113 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
118 for (n = 0; vnn->ifaces[n] != NULL; n++) {
119 if (strcmp(name, vnn->ifaces[n]) == 0) {
127 /* If any interfaces now have no possible IPs then delete them. This
128 * implementation is naive (i.e. simple) rather than clever
129 * (i.e. complex). Given that this is run on delip and that operation
130 * is rare, this doesn't need to be efficient - it needs to be
131 * foolproof. One alternative is reference counting, where the logic
132 * is distributed and can, therefore, be broken in multiple places.
133 * Another alternative is to build a red-black tree of interfaces that
134 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
135 * once) and then walking ctdb->ifaces once and deleting those not in
136 * the tree. Let's go to one of those if the naive implementation
137 * causes problems... :-)
139 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
140 struct ctdb_vnn *vnn)
142 struct ctdb_interface *i, *next;
144 /* For each interface, check if there's an IP using it. */
/* The loop advances through next rather than i->next; the current
 * entry may be unlinked at the end of the body. */
145 for (i = ctdb->ifaces; i != NULL; i = next) {
150 /* Only consider interfaces named in the given VNN. */
151 if (!vnn_has_interface_with_name(vnn, i->name)) {
155 /* Is the "single IP" on this interface? */
156 if ((ctdb->single_ip_vnn != NULL) &&
157 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
158 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
159 /* Found, next interface please... */
162 /* Search for a vnn with this interface. */
164 for (tv=ctdb->vnn; tv; tv=tv->next) {
165 if (vnn_has_interface_with_name(tv, i->name)) {
172 /* None of the VNNs are using this interface. */
173 DLIST_REMOVE(ctdb->ifaces, i);
/* Search ctdb->ifaces for an interface whose name equals iface. */
180 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
183 struct ctdb_interface *i;
185 for (i=ctdb->ifaces;i;i=i->next) {
186 if (strcmp(i->name, iface) == 0) {
/*
 * Choose an interface for the VNN from the names listed in vnn->ifaces.
 * Each name is resolved with ctdb_find_iface() and candidates are
 * compared by their reference count.
 */
194 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
195 struct ctdb_vnn *vnn)
198 struct ctdb_interface *cur = NULL;
199 struct ctdb_interface *best = NULL;
201 for (i=0; vnn->ifaces[i]; i++) {
203 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
/* a candidate with fewer references than the current best is taken here */
217 if (cur->references < best->references) {
/*
 * Assign an interface to the VNN.
 * The interface is chosen with ctdb_vnn_best_iface() and this node is
 * recorded as holder of the address (vnn->pnn = ctdb->pnn).  The cases
 * "still assigned", "no interface found" and "now assigned" are logged.
 */
226 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
227 struct ctdb_vnn *vnn)
229 struct ctdb_interface *best = NULL;
232 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233 "still assigned to iface '%s'\n",
234 ctdb_addr_to_str(&vnn->public_address),
235 ctdb_vnn_iface_string(vnn)));
239 best = ctdb_vnn_best_iface(ctdb, vnn);
241 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
242 "cannot assign to iface any iface\n",
243 ctdb_addr_to_str(&vnn->public_address)));
249 vnn->pnn = ctdb->pnn;
251 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
252 "now assigned to iface '%s' refs[%d]\n",
253 ctdb_addr_to_str(&vnn->public_address),
254 ctdb_vnn_iface_string(vnn),
/*
 * Drop the interface assignment of the VNN.
 * Logs the old interface together with its reference count and then
 * decrements that count.
 */
259 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
260 struct ctdb_vnn *vnn)
262 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
263 "now unassigned (old iface '%s' refs[%d])\n",
264 ctdb_addr_to_str(&vnn->public_address),
265 ctdb_vnn_iface_string(vnn),
266 vnn->iface?vnn->iface->references:0));
268 vnn->iface->references--;
/* extra handling applies when this node is the recorded holder */
271 if (vnn->pnn == ctdb->pnn) {
/*
 * Availability check for a VNN on this node.
 * Looks at the daemon runstate, the delete_pending flag, the link state
 * of the currently assigned interface and then at every interface listed
 * in vnn->ifaces.
 */
276 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
277 struct ctdb_vnn *vnn)
281 /* Nodes that are not RUNNING can not host IPs */
282 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
286 if (vnn->delete_pending) {
290 if (vnn->iface && vnn->iface->link_up) {
294 for (i=0; vnn->ifaces[i]; i++) {
295 struct ctdb_interface *cur;
297 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
/*
 * Per-announcement state passed as private data to
 * ctdb_control_send_arp(): the address is announced via ARP and the
 * connections in tcparray receive tickle ACKs.
 */
310 struct ctdb_takeover_arp {
311 struct ctdb_context *ctdb;
314 struct ctdb_tcp_array *tcparray;
315 struct ctdb_vnn *vnn;
320 lists of tcp endpoints
/* Doubly linked list element holding one TCP connection. */
322 struct ctdb_tcp_list {
323 struct ctdb_tcp_list *prev, *next;
324 struct ctdb_connection connection;
328 list of clients to kill on IP release
/*
 * Doubly linked list element associating a client with an IP address;
 * release_kill_clients() walks these via ctdb->client_ip_list.
 */
330 struct ctdb_client_ip {
331 struct ctdb_client_ip *prev, *next;
332 struct ctdb_context *ctdb;
339 send a gratuitous arp
/*
 * Timer handler that announces a taken-over address.
 * Sends an ARP for arp->addr on the interface of arp->vnn, then a TCP
 * tickle ACK for every connection in arp->tcparray.  After checking the
 * repeat counter against CTDB_ARP_REPEAT it schedules itself again on
 * arp->vnn->takeover_ctx.
 *
 * private_data must be a struct ctdb_takeover_arp.
 */
341 static void ctdb_control_send_arp(struct tevent_context *ev,
342 struct tevent_timer *te,
343 struct timeval t, void *private_data)
345 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
346 struct ctdb_takeover_arp);
348 struct ctdb_tcp_array *tcparray;
349 const char *iface = ctdb_vnn_iface_string(arp->vnn);
351 ret = ctdb_sys_send_arp(&arp->addr, iface);
353 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
354 iface, strerror(errno)));
357 tcparray = arp->tcparray;
/* one tickle ACK per known connection */
359 for (i=0;i<tcparray->num;i++) {
360 struct ctdb_connection *tcon;
362 tcon = &tcparray->connections[i];
363 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
364 (unsigned)ntohs(tcon->dst.ip.sin_port),
365 ctdb_addr_to_str(&tcon->src),
366 (unsigned)ntohs(tcon->src.ip.sin_port)));
367 ret = ctdb_sys_send_tcp(
372 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
373 ctdb_addr_to_str(&tcon->src)));
380 if (arp->count == CTDB_ARP_REPEAT) {
/* re-arm: next run is CTDB_ARP_INTERVAL seconds plus 100000 usec from now */
385 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
386 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
387 ctdb_control_send_arp, arp);
/*
 * Start announcing the public address of the VNN.
 * Creates vnn->takeover_ctx on demand, allocates a ctdb_takeover_arp
 * under it, moves the tcp_array of the VNN into that state and schedules
 * ctdb_control_send_arp() with a zero timeout.
 */
390 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
391 struct ctdb_vnn *vnn)
393 struct ctdb_takeover_arp *arp;
394 struct ctdb_tcp_array *tcparray;
396 if (!vnn->takeover_ctx) {
397 vnn->takeover_ctx = talloc_new(vnn);
398 if (!vnn->takeover_ctx) {
403 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
409 arp->addr = vnn->public_address;
412 tcparray = vnn->tcp_array;
414 /* add all of the known tcp connections for this IP to the
415 list of tcp connections to send tickle acks for */
416 arp->tcparray = talloc_steal(arp, tcparray);
/* the VNN no longer owns the array; flag that it needs refreshing */
418 vnn->tcp_array = NULL;
419 vnn->tcp_update_needed = true;
422 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
423 timeval_zero(), ctdb_control_send_arp, arp);
/*
 * Per-request state for an IP release: the pending control request, a
 * copy of the address and the VNN (see release_ip_callback()).
 */
428 struct takeover_callback_state {
429 struct ctdb_req_control_old *c;
430 ctdb_sock_addr *addr;
431 struct ctdb_vnn *vnn;
/* Per-request state for a takeip operation: pending control and VNN. */
434 struct ctdb_do_takeip_state {
435 struct ctdb_req_control_old *c;
436 struct ctdb_vnn *vnn;
440 called when takeip event finishes
/*
 * Completion handler for the takeip event.
 * The failure path logs the error, replies to the control with the event
 * status and sets NODE_FLAGS_UNHEALTHY on this node.  Otherwise the
 * address is announced when do_checkpublicip is set, a CTDB_SRVID_TAKE_IP
 * message carrying the address string is sent to this node and the
 * control is answered with 0.
 *
 * private_data must be a struct ctdb_do_takeip_state.
 */
442 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
445 struct ctdb_do_takeip_state *state =
446 talloc_get_type(private_data, struct ctdb_do_takeip_state);
451 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
453 if (status == -ETIME) {
456 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
457 ctdb_addr_to_str(&state->vnn->public_address),
458 ctdb_vnn_iface_string(state->vnn)));
459 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
461 node->flags |= NODE_FLAGS_UNHEALTHY;
466 if (ctdb->do_checkpublicip) {
468 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
470 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
/* message payload is the address string including its terminating NUL */
477 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478 data.dsize = strlen((char *)data.dptr) + 1;
479 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
481 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
484 /* the control succeeded */
485 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
/* talloc destructor for takeip state: clears update_in_flight on the VNN. */
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
492 state->vnn->update_in_flight = false;
497 take over an ip address
/*
 * Take over a public address on this node.
 * The request is rejected (with a log message) while another update for
 * the VNN is in flight.  Otherwise an interface is assigned, per-request
 * state is allocated on the VNN with a destructor that clears
 * update_in_flight, and the event script is started with the interface
 * name, address and netmask bits as arguments.
 * ctdb_do_takeip_callback() handles completion.
 */
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500 struct ctdb_req_control_old *c,
501 struct ctdb_vnn *vnn)
504 struct ctdb_do_takeip_state *state;
506 if (vnn->update_in_flight) {
507 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508 "update for this IP already in flight\n",
509 ctdb_addr_to_str(&vnn->public_address),
510 vnn->public_netmask_bits));
514 ret = ctdb_vnn_assign_iface(ctdb, vnn);
516 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517 "assign a usable interface\n",
518 ctdb_addr_to_str(&vnn->public_address),
519 vnn->public_netmask_bits));
523 state = talloc(vnn, struct ctdb_do_takeip_state);
524 CTDB_NO_MEMORY(ctdb, state);
/* the request is reparented to ctdb and referenced from the state */
526 state->c = talloc_steal(ctdb, c);
529 vnn->update_in_flight = true;
530 talloc_set_destructor(state, ctdb_takeip_destructor);
532 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533 ctdb_addr_to_str(&vnn->public_address),
534 vnn->public_netmask_bits,
535 ctdb_vnn_iface_string(vnn)));
537 ret = ctdb_event_script_callback(ctdb,
539 ctdb_do_takeip_callback,
543 ctdb_vnn_iface_string(vnn),
544 ctdb_addr_to_str(&vnn->public_address),
545 vnn->public_netmask_bits);
548 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549 ctdb_addr_to_str(&vnn->public_address),
550 ctdb_vnn_iface_string(vnn)));
/*
 * Per-request state for an updateip operation: pending control, the
 * interface the address was on before the move, and the VNN.
 */
558 struct ctdb_do_updateip_state {
559 struct ctdb_req_control_old *c;
560 struct ctdb_interface *old;
561 struct ctdb_vnn *vnn;
565 called when updateip event finishes
/*
 * Completion handler for the updateip event.
 * The failure path logs the error, puts the VNN back on the old
 * interface (restoring its reference) and replies with the event status.
 * Otherwise the address is announced when do_checkpublicip is set and
 * the control is answered with 0.
 *
 * private_data must be a struct ctdb_do_updateip_state.
 */
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
570 struct ctdb_do_updateip_state *state =
571 talloc_get_type(private_data, struct ctdb_do_updateip_state);
575 if (status == -ETIME) {
578 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
579 ctdb_addr_to_str(&state->vnn->public_address),
581 ctdb_vnn_iface_string(state->vnn)));
584 * All we can do is reset the old interface
585 * and let the next run fix it
587 ctdb_vnn_unassign_iface(ctdb, state->vnn);
588 state->vnn->iface = state->old;
589 state->vnn->iface->references++;
591 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
596 if (ctdb->do_checkpublicip) {
598 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
600 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
607 /* the control succeeded */
608 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
/* talloc destructor for updateip state: clears update_in_flight on the VNN. */
613 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
615 state->vnn->update_in_flight = false;
620 update (move) an ip address
/*
 * Move a public address to a different interface on this node.
 * The request is rejected while another update for the VNN is in
 * flight.  The current interface is remembered in old, the VNN is
 * unassigned and reassigned, and when the new interface has the same
 * name as the old one the control is answered with 0 without running
 * the event scripts.  Otherwise per-request state is allocated on the
 * VNN (its destructor clears update_in_flight) and the
 * CTDB_EVENT_UPDATE_IP event is started; ctdb_do_updateip_callback()
 * handles completion.
 */
622 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
623 struct ctdb_req_control_old *c,
624 struct ctdb_vnn *vnn)
627 struct ctdb_do_updateip_state *state;
628 struct ctdb_interface *old = vnn->iface;
629 const char *new_name;
631 if (vnn->update_in_flight) {
632 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
633 "update for this IP already in flight\n",
634 ctdb_addr_to_str(&vnn->public_address),
635 vnn->public_netmask_bits));
/* drop the current assignment so a fresh choice can be made */
639 ctdb_vnn_unassign_iface(ctdb, vnn);
640 ret = ctdb_vnn_assign_iface(ctdb, vnn);
642 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
643 "assin a usable interface (old iface '%s')\n",
644 ctdb_addr_to_str(&vnn->public_address),
645 vnn->public_netmask_bits,
650 new_name = ctdb_vnn_iface_string(vnn);
651 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
652 /* A benign update from one interface onto itself.
653 * no need to run the eventscripts in this case, just return
656 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
660 state = talloc(vnn, struct ctdb_do_updateip_state);
661 CTDB_NO_MEMORY(ctdb, state);
663 state->c = talloc_steal(ctdb, c);
667 vnn->update_in_flight = true;
668 talloc_set_destructor(state, ctdb_updateip_destructor);
670 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
671 "interface %s to %s\n",
672 ctdb_addr_to_str(&vnn->public_address),
673 vnn->public_netmask_bits,
677 ret = ctdb_event_script_callback(ctdb,
679 ctdb_do_updateip_callback,
681 CTDB_EVENT_UPDATE_IP,
685 ctdb_addr_to_str(&vnn->public_address),
686 vnn->public_netmask_bits);
688 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
689 ctdb_addr_to_str(&vnn->public_address),
690 old->name, new_name));
699 Find the vnn that holds the given public ip address
700 returns NULL if the address is not known as a public address
/* Walk ctdb->vnn looking for the VNN whose public address equals addr. */
702 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
704 struct ctdb_vnn *vnn;
706 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
707 if (ctdb_same_ip(&vnn->public_address, addr)) {
716 take over an ip address
/*
 * Control handler: take over a public IP address on this node.
 *
 * indata carries a struct ctdb_public_ip.  The request is checked against
 * the local pnn and the local VNN list; when IP failover is enabled and
 * do_checkpublicip is set, the kernel is asked whether the address is
 * already configured and the best interface is looked up.  Depending on
 * the current VNN state the request ends in ctdb_do_takeip(),
 * ctdb_do_updateip() or is logged as redundant.
 *
 * Log messages built from adjacent string literals keep a separating
 * space at each join so the emitted text stays readable.
 */
718 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
719 struct ctdb_req_control_old *c,
724 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
725 struct ctdb_vnn *vnn;
726 bool have_ip = false;
727 bool do_updateip = false;
728 bool do_takeip = false;
729 struct ctdb_interface *best_iface = NULL;
731 if (pip->pnn != ctdb->pnn) {
732 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
733 "with pnn %d, but we're node %d\n",
734 ctdb_addr_to_str(&pip->addr),
735 pip->pnn, ctdb->pnn));
739 /* update our vnn list */
740 vnn = find_public_ip_vnn(ctdb, &pip->addr);
742 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
743 ctdb_addr_to_str(&pip->addr)));
747 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
748 have_ip = ctdb_sys_have_ip(&pip->addr);
750 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
751 if (best_iface == NULL) {
752 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find "
753 "a usable interface (old %s, have_ip %d)\n",
754 ctdb_addr_to_str(&vnn->public_address),
755 vnn->public_netmask_bits,
756 ctdb_vnn_iface_string(vnn),
761 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
762 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
767 if (vnn->iface == NULL && have_ip) {
768 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
769 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
770 ctdb_addr_to_str(&vnn->public_address)));
774 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
775 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
776 "and we have it on iface[%s], but it was assigned to node %d "
777 "and we are node %d, banning ourself\n",
778 ctdb_addr_to_str(&vnn->public_address),
779 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
784 if (vnn->pnn == -1 && have_ip) {
785 vnn->pnn = ctdb->pnn;
786 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
787 "and we already have it on iface[%s], update local daemon\n",
788 ctdb_addr_to_str(&vnn->public_address),
789 ctdb_vnn_iface_string(vnn)));
794 if (vnn->iface != best_iface) {
795 if (!vnn->iface->link_up) {
797 } else if (vnn->iface->references > (best_iface->references + 1)) {
798 /* only move when the rebalance gains something */
806 ctdb_vnn_unassign_iface(ctdb, vnn);
813 ret = ctdb_do_takeip(ctdb, c, vnn);
817 } else if (do_updateip) {
818 ret = ctdb_do_updateip(ctdb, c, vnn);
824 * The interface is up and the kernel knows the ip
827 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
828 ctdb_addr_to_str(&pip->addr),
829 vnn->public_netmask_bits,
830 ctdb_vnn_iface_string(vnn)));
834 /* tell ctdb_control.c that we will be replying asynchronously */
841 kill any clients that are registered with an IP that is being released
/*
 * Walk ctdb->client_ip_list and, for every entry matching addr, look up
 * the client through reqid_find() and send SIGKILL to its pid when that
 * pid is non-zero.
 */
843 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
845 struct ctdb_client_ip *ip;
847 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
848 ctdb_addr_to_str(addr)));
850 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
851 ctdb_sock_addr tmp_addr;
854 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
856 ctdb_addr_to_str(&ip->addr)));
/* the comparison uses the local copy tmp_addr rather than ip->addr */
858 if (ctdb_same_ip(&tmp_addr, addr)) {
859 struct ctdb_client *client = reqid_find(ctdb->idr,
862 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
864 ctdb_addr_to_str(&ip->addr),
867 if (client->pid != 0) {
868 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
869 (unsigned)client->pid,
870 ctdb_addr_to_str(addr),
872 kill(client->pid, SIGKILL);
/*
 * Delete a VNN: unlink it from ctdb->vnn, drop its interface assignment
 * and prune interfaces that no remaining VNN uses.
 */
878 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
880 DLIST_REMOVE(ctdb->vnn, vnn);
881 ctdb_vnn_unassign_iface(ctdb, vnn);
882 ctdb_remove_orphaned_ifaces(ctdb, vnn);
887 called when releaseip event finishes
/*
 * Completion handler for the releaseip event.
 * When IP failover is enabled and do_checkpublicip is set, an address
 * that the kernel still hosts is treated as a failure.  Otherwise a
 * CTDB_SRVID_RELEASE_IP message with the address string is sent to this
 * node, clients registered with the address are killed, the VNN is
 * unassigned (and deleted when delete_pending is set) and the control is
 * answered with 0.
 *
 * private_data must be a struct takeover_callback_state.
 */
889 static void release_ip_callback(struct ctdb_context *ctdb, int status,
892 struct takeover_callback_state *state =
893 talloc_get_type(private_data, struct takeover_callback_state);
896 if (status == -ETIME) {
900 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
901 if (ctdb_sys_have_ip(state->addr)) {
903 ("IP %s still hosted during release IP callback, failing\n",
904 ctdb_addr_to_str(state->addr)));
905 ctdb_request_control_reply(ctdb, state->c,
912 /* send a message to all clients of this node telling them
913 that the cluster has been reconfigured and they should
914 release any sockets on this IP */
915 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
916 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
917 data.dsize = strlen((char *)data.dptr)+1;
919 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
921 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
923 /* kill clients that have registered with this IP */
924 release_kill_clients(ctdb, state->addr);
926 ctdb_vnn_unassign_iface(ctdb, state->vnn);
928 /* Process the IP if it has been marked for deletion */
929 if (state->vnn->delete_pending) {
930 do_delete_ip(ctdb, state->vnn);
934 /* the control succeeded */
935 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
/*
 * talloc destructor for release state: clears update_in_flight on the
 * VNN when one is still attached to the state.
 */
939 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
941 if (state->vnn != NULL) {
942 state->vnn->update_in_flight = false;
948 release an ip address
/*
 * Control handler: release a public IP address held by this node.
 *
 * indata carries a struct ctdb_public_ip.  Pending announcements for the
 * VNN are cancelled by freeing vnn->takeover_ctx.  Releases for addresses
 * that are not held (kernel check or no interface assigned) are logged
 * as redundant, and a release is rejected while another update for the
 * VNN is in flight.  Otherwise per-request state with a copy of the
 * address is allocated, update_in_flight is set (cleared again by the
 * state destructor), the CTDB_EVENT_RELEASE_IP event is started with
 * release_ip_callback() as completion handler and *async_reply is set.
 */
950 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
951 struct ctdb_req_control_old *c,
956 struct takeover_callback_state *state;
957 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
958 struct ctdb_vnn *vnn;
961 /* update our vnn list */
962 vnn = find_public_ip_vnn(ctdb, &pip->addr);
964 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
965 ctdb_addr_to_str(&pip->addr)));
970 /* stop any previous arps */
971 talloc_free(vnn->takeover_ctx);
972 vnn->takeover_ctx = NULL;
974 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
975 * lazy multicast to drop an IP from any node that isn't the
976 * intended new node. The following causes makes ctdbd ignore
977 * a release for any address it doesn't host.
979 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
980 if (!ctdb_sys_have_ip(&pip->addr)) {
981 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
982 ctdb_addr_to_str(&pip->addr),
983 vnn->public_netmask_bits,
984 ctdb_vnn_iface_string(vnn)));
985 ctdb_vnn_unassign_iface(ctdb, vnn);
989 if (vnn->iface == NULL) {
990 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
991 ctdb_addr_to_str(&pip->addr),
992 vnn->public_netmask_bits));
997 /* There is a potential race between take_ip and us because we
998 * update the VNN via a callback that run when the
999 * eventscripts have been run. Avoid the race by allowing one
1000 * update to be in flight at a time.
1002 if (vnn->update_in_flight) {
1003 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1004 "update for this IP already in flight\n",
1005 ctdb_addr_to_str(&vnn->public_address),
1006 vnn->public_netmask_bits));
1010 iface = strdup(ctdb_vnn_iface_string(vnn));
1012 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
1013 ctdb_addr_to_str(&pip->addr),
1014 vnn->public_netmask_bits,
1018 state = talloc(ctdb, struct takeover_callback_state);
1019 if (state == NULL) {
1020 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1021 __FILE__, __LINE__);
1026 state->c = talloc_steal(state, c);
1027 state->addr = talloc(state, ctdb_sock_addr);
1028 if (state->addr == NULL) {
1029 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1030 __FILE__, __LINE__);
1035 *state->addr = pip->addr;
1038 vnn->update_in_flight = true;
1039 talloc_set_destructor(state, ctdb_releaseip_destructor);
1041 ret = ctdb_event_script_callback(ctdb,
1042 state, release_ip_callback, state,
1043 CTDB_EVENT_RELEASE_IP,
1046 ctdb_addr_to_str(&pip->addr),
1047 vnn->public_netmask_bits);
1050 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1051 ctdb_addr_to_str(&pip->addr),
1052 ctdb_vnn_iface_string(vnn)));
1057 /* tell the control that we will be replying asynchronously */
1058 *async_reply = true;
/*
 * Add one public address together with its comma separated interface list.
 * Every listed interface is checked with ctdb_sys_check_iface_exists(),
 * an address already present on ctdb->vnn is reported as a duplicate, and
 * a new VNN is built whose ifaces array is NULL-terminated.  With
 * check_address set, an address the kernel already hosts marks this node
 * as holder.  Each interface is registered with ctdb_add_local_iface()
 * and the VNN is linked onto ctdb->vnn.
 */
1062 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1063 ctdb_sock_addr *addr,
1064 unsigned mask, const char *ifaces,
1067 struct ctdb_vnn *vnn;
/* strtok() modifies its input, so the list is tokenised from a copy */
1074 tmp = strdup(ifaces);
1075 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1076 if (!ctdb_sys_check_iface_exists(iface)) {
1077 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1084 /* Verify that we don't have an entry for this ip yet */
1085 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1086 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1087 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1088 ctdb_addr_to_str(addr)));
1093 /* create a new vnn structure for this ip address */
1094 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1095 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1096 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1097 tmp = talloc_strdup(vnn, ifaces);
1098 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
/* second pass over the list: the array is resized to num + 2 entries
 * before each name is stored at index num */
1099 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1101 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1102 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1103 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1107 vnn->ifaces[num] = NULL;
1108 vnn->public_address = *addr;
1109 vnn->public_netmask_bits = mask;
1111 if (check_address) {
1112 if (ctdb_sys_have_ip(addr)) {
1113 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1114 vnn->pnn = ctdb->pnn;
1118 for (i=0; vnn->ifaces[i]; i++) {
1119 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1121 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1122 "for public_address[%s]\n",
1123 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1129 DLIST_ADD(ctdb->vnn, vnn);
1135 setup the public address lists from a file
/*
 * Load the public address list from ctdb->public_addresses_file and add
 * each entry with ctdb_add_public_address().
 * Trailing empty lines are dropped from the count, leading blanks and
 * tabs are skipped, and each line is split on blanks and tabs.
 * ctdb->default_public_interface supplies the interface list; its absence
 * is reported as an error with the line number.  Error messages use
 * 1-based line numbers.
 */
1137 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1143 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1144 if (lines == NULL) {
1145 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1148 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1152 for (i=0;i<nlines;i++) {
1154 ctdb_sock_addr addr;
1155 const char *addrstr;
1160 while ((*line == ' ') || (*line == '\t')) {
1166 if (strcmp(line, "") == 0) {
1169 tok = strtok(line, " \t");
1171 tok = strtok(NULL, " \t");
1173 if (NULL == ctdb->default_public_interface) {
1174 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1179 ifaces = ctdb->default_public_interface;
1184 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1185 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1189 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1190 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
/*
 * Configure the single public IP of this node.
 * Builds a VNN with exactly one interface name, parses the address,
 * registers the interface, marks it as link-up, assigns it to the VNN
 * and stores the VNN in ctdb->single_ip_vnn.
 */
1201 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1205 struct ctdb_vnn *svnn;
1206 struct ctdb_interface *cur = NULL;
1210 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1211 CTDB_NO_MEMORY(ctdb, svnn);
/* two slots: the interface name and the terminating NULL */
1213 svnn->ifaces = talloc_array(svnn, const char *, 2);
1214 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1215 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1216 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1217 svnn->ifaces[1] = NULL;
1219 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1225 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1227 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1228 "for single_ip[%s]\n",
1230 ctdb_addr_to_str(&svnn->public_address)));
1235 /* assume the single public ip interface is initially "good" */
1236 cur = ctdb_find_iface(ctdb, iface);
1238 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1241 cur->link_up = true;
1243 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1249 ctdb->single_ip_vnn = svnn;
/*
 * Singly linked list element describing one public address during IP
 * allocation.
 */
1253 struct public_ip_list {
1254 struct public_ip_list *next;
1256 ctdb_sock_addr addr;
1259 /* Given a physical node, return the number of
1260 public addresses that are currently assigned to this node.
/* Walk the ips list and test each entry's pnn against the given node. */
1262 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1266 for (;ips;ips=ips->next) {
1267 if (ips->pnn == pnn) {
1275 /* Can the given node host the given IP: is the public IP known to the
1276 * node and is NOIPHOST unset?
/*
 * Check the noiphost flag and then search the available public IPs of
 * node pnn (ipalloc_state->available_public_ips[pnn]) for the address.
 */
1278 static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
1280 struct ctdb_ipflags ipflags,
1281 struct public_ip_list *ip)
1283 struct ctdb_public_ip_list_old *public_ips;
1286 if (ipflags.noiphost) {
1290 public_ips = ipalloc_state->available_public_ips[pnn];
1292 if (public_ips == NULL) {
1296 for (i=0; i<public_ips->num; i++) {
1297 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1298 /* yes, this node can serve this public ip */
/*
 * Takeover variant of can_node_host_ip(): the noiptakeover flag is
 * checked first, then the decision is delegated to can_node_host_ip().
 */
1306 static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
1308 struct ctdb_ipflags ipflags,
1309 struct public_ip_list *ip)
1311 if (ipflags.noiptakeover) {
1315 return can_node_host_ip(ipalloc_state, pnn, ipflags, ip);
1318 /* search the node list for a node to take over this ip.
1319 pick the node that is currently serving the least number of ips
1320 so that the ips get spread out evenly.
/*
 * Choose a node to take over the given address.
 * Every node index below ipalloc_state->num is tested with
 * can_node_takeover_ip(); for eligible nodes the number of addresses
 * already assigned is obtained from node_ip_coverage().  A warning is
 * logged when no node can take the address.
 */
1322 static int find_takeover_node(struct ipalloc_state *ipalloc_state,
1323 struct ctdb_ipflags *ipflags,
1324 struct public_ip_list *ip,
1325 struct public_ip_list *all_ips)
1327 int pnn, min=0, num;
1330 numnodes = ipalloc_state->num;
1332 for (i=0; i<numnodes; i++) {
1333 /* verify that this node can serve this ip */
1334 if (!can_node_takeover_ip(ipalloc_state, i, ipflags[i], ip)) {
1335 /* no it couldnt so skip to the next node */
1339 num = node_ip_coverage(i, all_ips);
1340 /* was this the first node we checked ? */
1352 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1353 ctdb_addr_to_str(&ip->addr)));
/*
 * Build the IP_KEYLEN-word tree key for an address.
 * The key lives in a static buffer that is zeroed on every call, so the
 * result is only valid until the next call.  The branch reading
 * ip->ip.sin_addr fills only key[3]; the branch reading
 * ip->ip6.sin6_addr fills key[0] to key[3].  An unknown address family
 * is logged.
 */
1363 static uint32_t *ip_key(ctdb_sock_addr *ip)
1365 static uint32_t key[IP_KEYLEN];
1367 bzero(key, sizeof(key));
1369 switch (ip->sa.sa_family) {
1371 key[3] = htonl(ip->ip.sin_addr.s_addr);
1374 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1375 key[0] = htonl(s6_a32[0]);
1376 key[1] = htonl(s6_a32[1]);
1377 key[2] = htonl(s6_a32[2]);
1378 key[3] = htonl(s6_a32[3]);
1382 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
/*
 * Tree insert callback: parm is the entry being inserted, data the entry
 * already stored under the same key (NULL when there is none).  A new
 * entry without a node (pnn == -1) inherits the pnn of the stored one.
 */
1389 static void *add_ip_callback(void *parm, void *data)
1391 struct public_ip_list *this_ip = parm;
1392 struct public_ip_list *prev_ip = data;
1394 if (prev_ip == NULL) {
1397 if (this_ip->pnn == -1) {
1398 this_ip->pnn = prev_ip->pnn;
/*
 * Tree traversal callback: param points at the head of a public_ip_list,
 * data is the visited entry; the entry is linked in front of the current
 * head.
 */
1404 static int getips_count_callback(void *param, void *data)
1406 struct public_ip_list **ip_list = (struct public_ip_list **)param;
1407 struct public_ip_list *new_ip = (struct public_ip_list *)data;
1409 new_ip->next = *ip_list;
/* Forward declaration; called from ctdb_reload_remote_public_ips(). */
1414 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1415 struct ctdb_public_ip_list_old *ips,
/*
 * Refresh the per-node public IP lists in ipalloc_state.
 * ipalloc_state->num must equal nodemap->num.  For each node in the map
 * (the NODE_FLAGS_INACTIVE flag is tested first) the known and the
 * available public IPs are fetched with ctdb_ctrl_get_public_ips_flags();
 * the known list is passed to verify_remote_ip_allocation() when
 * do_checkpublicip is set.
 */
1418 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1419 struct ipalloc_state *ipalloc_state,
1420 struct ctdb_node_map_old *nodemap)
1425 if (ipalloc_state->num != nodemap->num) {
1428 " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1429 ipalloc_state->num, nodemap->num));
1433 for (j=0; j<nodemap->num; j++) {
1434 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1438 /* Retrieve the list of known public IPs from the node */
1439 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1444 &ipalloc_state->known_public_ips[j]);
1447 ("Failed to read known public IPs from node: %u\n",
1452 if (ctdb->do_checkpublicip) {
1453 verify_remote_ip_allocation(ctdb,
1454 ipalloc_state->known_public_ips[j],
1458 /* Retrieve the list of available public IPs from the node */
1459 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1463 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1464 &ipalloc_state->available_public_ips[j]);
1467 ("Failed to read available public IPs from node: %u\n",
/*
 * Merge the known public IPs of all nodes into one list.
 * ctdb->ip_tree is recreated, one public_ip_list entry per known address
 * is allocated on the tree and inserted under its ip_key(), and the tree
 * is finally traversed with getips_count_callback() to build ip_list.
 * The NODE_FLAGS_DELETED flag and a NULL address list are tested per node.
 */
1476 static struct public_ip_list *
1477 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1480 struct public_ip_list *ip_list;
1481 struct ctdb_public_ip_list_old *public_ips;
/* start from an empty tree on every call */
1483 TALLOC_FREE(ctdb->ip_tree);
1484 ctdb->ip_tree = trbt_create(ctdb, 0);
1486 for (i=0; i < ctdb->num_nodes; i++) {
1487 public_ips = ipalloc_state->known_public_ips[i];
1489 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1493 /* there were no public ips for this node */
1494 if (public_ips == NULL) {
1498 for (j=0; j < public_ips->num; j++) {
1499 struct public_ip_list *tmp_ip;
1501 tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1502 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1503 /* Do not use information about IP addresses hosted
1504 * on other nodes, it may not be accurate */
1505 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1506 tmp_ip->pnn = public_ips->ips[j].pnn;
1510 tmp_ip->addr = public_ips->ips[j].addr;
1511 tmp_ip->next = NULL;
1513 trbt_insertarray32_callback(ctdb->ip_tree,
1514 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1521 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1527 * This is the length of the longtest common prefix between the IPs.
1528 * It is calculated by XOR-ing the 2 IPs together and counting the
1529 * number of leading zeroes. The implementation means that all
1530 * addresses end up being 128 bits long.
1532 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1533 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1534 * lots of nodes and IP addresses?
1536 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1538 uint32_t ip1_k[IP_KEYLEN];
1543 uint32_t distance = 0;
1545 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1547 for (i=0; i<IP_KEYLEN; i++) {
1548 x = ip1_k[i] ^ t[i];
1552 /* Count number of leading zeroes.
1553 * FIXME? This could be optimised...
1555 while ((x & (1 << 31)) == 0) {
1565 /* Calculate the IP distance for the given IP relative to IPs on the
1566 given node. The ips argument is generally the all_ips variable
1567 used in the main part of the algorithm.
1569 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1570 struct public_ip_list *ips,
1573 struct public_ip_list *t;
1578 for (t=ips; t != NULL; t=t->next) {
1579 if (t->pnn != pnn) {
1583 /* Optimisation: We never calculate the distance
1584 * between an address and itself. This allows us to
1585 * calculate the effect of removing an address from a
1586 * node by simply calculating the distance between
1587 * that address and all of the exitsing addresses.
1588 * Moreover, we assume that we're only ever dealing
1589 * with addresses from all_ips so we can identify an
1590 * address via a pointer rather than doing a more
1591 * expensive address comparison. */
1592 if (&(t->addr) == ip) {
1596 d = ip_distance(ip, &(t->addr));
1597 sum += d * d; /* Cheaper than pulling in math.h :-) */
1603 /* Return the LCP2 imbalance metric for addresses currently assigned
1606 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1608 struct public_ip_list *t;
1610 uint32_t imbalance = 0;
1612 for (t=all_ips; t!=NULL; t=t->next) {
1613 if (t->pnn != pnn) {
1616 /* Pass the rest of the IPs rather than the whole
1619 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1625 /* Allocate any unassigned IPs just by looping through the IPs and
1626 * finding the best node for each.
/*
 * basic_allocate_unassigned(): walk all_ips and, for every entry whose
 * pnn is -1 (unassigned), call find_takeover_node().  When that call
 * returns non-zero a DEBUG_WARNING naming the address is logged.
 * NOTE(review): the remaining arguments of the find_takeover_node() call
 * are not visible in this listing.
 */
1628 static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1629 struct ctdb_ipflags *ipflags,
1630 struct public_ip_list *all_ips)
1632 struct public_ip_list *tmp_ip;
1634 /* loop over all ip's and find a physical node to cover for
1637 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1638 if (tmp_ip->pnn == -1) {
1639 if (find_takeover_node(ipalloc_state, ipflags,
1641 DEBUG(DEBUG_WARNING,
1642 ("Failed to find node to cover ip %s\n",
1643 ctdb_addr_to_str(&tmp_ip->addr)));
1649 /* Basic non-deterministic rebalancing algorithm.
/*
 * basic_failback(): even out the number of IPs per node.
 *
 * For each assigned IP, every node for which can_node_takeover_ip()
 * holds is examined with node_ip_coverage() and the nodes with the
 * highest (maxnode/maxnum) and lowest (minnode/minnum) coverage are
 * tracked.  If no node qualifies a warning is logged.  When
 * maxnum > minnum+1 and fewer than num_ips + 5 retries have been made,
 * one IP currently on maxnode is handed to find_takeover_node() (its
 * result is deliberately ignored).
 * NOTE(review): the num_ips parameter line, the min/max update
 * statements and the retry/restart statements are not visible in this
 * listing; confirm against the full file.
 */
1651 static void basic_failback(struct ipalloc_state *ipalloc_state,
1652 struct ctdb_ipflags *ipflags,
1653 struct public_ip_list *all_ips,
1657 int maxnode, maxnum, minnode, minnum, num, retries;
1658 struct public_ip_list *tmp_ip;
1660 numnodes = ipalloc_state->num;
1667 /* for each ip address, loop over all nodes that can serve
1668 this ip and make sure that the difference between the node
1669 serving the most and the node serving the least ip's are
1672 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1673 if (tmp_ip->pnn == -1) {
1677 /* Get the highest and lowest number of ips's served by any
1678 valid node which can serve this ip.
1682 for (i=0; i<numnodes; i++) {
1683 /* only check nodes that can actually serve this ip */
1684 if (!can_node_takeover_ip(ipalloc_state, i,
1685 ipflags[i], tmp_ip)) {
1686 /* no it couldnt so skip to the next node */
1690 num = node_ip_coverage(i, all_ips);
1691 if (maxnode == -1) {
1700 if (minnode == -1) {
1710 if (maxnode == -1) {
1711 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1712 ctdb_addr_to_str(&tmp_ip->addr)));
1717 /* if the spread between the smallest and largest coverage by
1718 a node is >=2 we steal one of the ips from the node with
1719 most coverage to even things out a bit.
1720 try to do this a limited number of times since we dont
1721 want to spend too much time balancing the ip coverage.
1723 if ( (maxnum > minnum+1)
1724 && (retries < (num_ips + 5)) ){
1725 struct public_ip_list *tmp;
1727 /* Reassign one of maxnode's VNNs */
1728 for (tmp=all_ips;tmp;tmp=tmp->next) {
1729 if (tmp->pnn == maxnode) {
1730 (void)find_takeover_node(ipalloc_state,
/*
 * lcp2_init(): allocate and fill the per-node working arrays for LCP2.
 *
 * *rebalance_candidates and *lcp2_imbalances are talloc arrays of
 * ipalloc_state->num entries owned by ipalloc_state; an allocation
 * failure is logged.  Each node's imbalance is computed with
 * lcp2_imbalance() and every node starts as a rebalance candidate.
 * Nodes that already have an IP assigned are then removed as candidates,
 * and finally every pnn listed in force_rebalance_nodes (when that is
 * not NULL) is made a candidate again, after a range check against the
 * node count.
 * NOTE(review): the return statements and DEBUG levels of two messages
 * are not visible in this listing.
 */
1742 static bool lcp2_init(struct ipalloc_state *ipalloc_state,
1743 struct ctdb_ipflags *ipflags,
1744 struct public_ip_list *all_ips,
1745 uint32_t *force_rebalance_nodes,
1746 uint32_t **lcp2_imbalances,
1747 bool **rebalance_candidates)
1750 struct public_ip_list *tmp_ip;
1752 numnodes = ipalloc_state->num;
1754 *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
1755 if (*rebalance_candidates == NULL) {
1756 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1759 *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
1760 if (*lcp2_imbalances == NULL) {
1761 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1765 for (i=0; i<numnodes; i++) {
1766 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1767 /* First step: assume all nodes are candidates */
1768 (*rebalance_candidates)[i] = true;
1771 /* 2nd step: if a node has IPs assigned then it must have been
1772 * healthy before, so we remove it from consideration. This
1773 * is overkill but is all we have because we don't maintain
1774 * state between takeover runs. An alternative would be to
1775 * keep state and invalidate it every time the recovery master
1778 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1779 if (tmp_ip->pnn != -1) {
1780 (*rebalance_candidates)[tmp_ip->pnn] = false;
1784 /* 3rd step: if a node is forced to re-balance then
1785 we allow failback onto the node */
1786 if (force_rebalance_nodes == NULL) {
1789 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1790 uint32_t pnn = force_rebalance_nodes[i];
1791 if (pnn >= numnodes) {
1793 (__location__ "unknown node %u\n", pnn));
1798 ("Forcing rebalancing of IPs to node %u\n", pnn));
1799 (*rebalance_candidates)[pnn] = true;
1805 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1806 * the IP/node combination that will cost the least.
/*
 * lcp2_allocate_unassigned(): place unassigned IPs one at a time.
 *
 * Each pass of the outer loop considers every unassigned IP against
 * every node for which can_node_takeover_ip() holds, computing the extra
 * squared distance (dstdsum) the IP would add to that node and the
 * resulting imbalance (dstimbl).  The combination with the smallest
 * dstdsum is remembered; if one was found the IP is assigned to that
 * node and lcp2_imbalances[] is updated.  The loop runs while there are
 * unassigned IPs and should_loop is set.  IPs still unassigned at the
 * end are reported with a DEBUG_WARNING.
 * NOTE(review): the statements that record the best candidate and reset
 * it for each pass are not visible in this listing.
 */
1808 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1809 struct ctdb_ipflags *ipflags,
1810 struct public_ip_list *all_ips,
1811 uint32_t *lcp2_imbalances)
1813 struct public_ip_list *tmp_ip;
1814 int dstnode, numnodes;
1817 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1818 struct public_ip_list *minip;
1820 bool should_loop = true;
1821 bool have_unassigned = true;
1823 numnodes = ipalloc_state->num;
1825 while (have_unassigned && should_loop) {
1826 should_loop = false;
1828 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1829 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1835 /* loop over each unassigned ip. */
1836 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1837 if (tmp_ip->pnn != -1) {
1841 for (dstnode=0; dstnode<numnodes; dstnode++) {
1842 /* only check nodes that can actually takeover this ip */
1843 if (!can_node_takeover_ip(ipalloc_state,
1847 /* no it couldnt so skip to the next node */
1851 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1852 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1853 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1854 ctdb_addr_to_str(&(tmp_ip->addr)),
1856 dstimbl - lcp2_imbalances[dstnode]));
1859 if ((minnode == -1) || (dstdsum < mindsum)) {
1869 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1871 /* If we found one then assign it to the given node. */
1872 if (minnode != -1) {
1873 minip->pnn = minnode;
1874 lcp2_imbalances[minnode] = minimbl;
1875 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1876 ctdb_addr_to_str(&(minip->addr)),
1881 /* There might be a better way but at least this is clear. */
1882 have_unassigned = false;
1883 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1884 if (tmp_ip->pnn == -1) {
1885 have_unassigned = true;
1890 /* We know if we have an unassigned addresses so we might as
1893 if (have_unassigned) {
1894 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1895 if (tmp_ip->pnn == -1) {
1896 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1897 ctdb_addr_to_str(&tmp_ip->addr)));
1903 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1904 * to move IPs from, determines the best IP/destination node
1905 * combination to move from the source node.
/*
 * lcp2_failback_candidate(): try to move one IP away from srcnode.
 *
 * For every IP on srcnode the cost it contributes there (srcdsum) and
 * the source imbalance without it (srcimbl) are computed.  Each node
 * that is a rebalance candidate and passes can_node_takeover_ip() is
 * then priced (dstdsum, dstimbl).  A move is accepted as the current
 * best only if the destination would end up less imbalanced than the
 * source is now, the IP costs less on the destination than on the
 * source, and the combined imbalance beats the best seen so far.  When a
 * best move exists it is logged at DEBUG_INFO, both lcp2_imbalances[]
 * entries are updated and the IP's pnn is set to the destination.
 * NOTE(review): the srcnode parameter line, the initialisation of the
 * "best" variables and the return statements are not visible in this
 * listing.
 */
1907 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
1908 struct ctdb_ipflags *ipflags,
1909 struct public_ip_list *all_ips,
1911 uint32_t *lcp2_imbalances,
1912 bool *rebalance_candidates)
1914 int dstnode, mindstnode, numnodes;
1915 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1916 uint32_t minsrcimbl, mindstimbl;
1917 struct public_ip_list *minip;
1918 struct public_ip_list *tmp_ip;
1920 /* Find an IP and destination node that best reduces imbalance. */
1927 numnodes = ipalloc_state->num;
1929 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1930 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1931 srcnode, lcp2_imbalances[srcnode]));
1933 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1934 /* Only consider addresses on srcnode. */
1935 if (tmp_ip->pnn != srcnode) {
1939 /* What is this IP address costing the source node? */
1940 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1941 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1943 /* Consider this IP address would cost each potential
1944 * destination node. Destination nodes are limited to
1945 * those that are newly healthy, since we don't want
1946 * to do gratuitous failover of IPs just to make minor
1947 * balance improvements.
1949 for (dstnode=0; dstnode<numnodes; dstnode++) {
1950 if (!rebalance_candidates[dstnode]) {
1954 /* only check nodes that can actually takeover this ip */
1955 if (!can_node_takeover_ip(ipalloc_state, dstnode,
1956 ipflags[dstnode], tmp_ip)) {
1957 /* no it couldnt so skip to the next node */
1961 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1962 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1963 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1965 ctdb_addr_to_str(&(tmp_ip->addr)),
1968 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1969 (dstdsum < srcdsum) && \
1970 ((mindstnode == -1) || \
1971 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1974 minsrcimbl = srcimbl;
1975 mindstnode = dstnode;
1976 mindstimbl = dstimbl;
1980 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1982 if (mindstnode != -1) {
1983 /* We found a move that makes things better... */
1984 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1985 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1986 ctdb_addr_to_str(&(minip->addr)),
1987 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1990 lcp2_imbalances[srcnode] = minsrcimbl;
1991 lcp2_imbalances[mindstnode] = mindstimbl;
1992 minip->pnn = mindstnode;
/* Element type of the array that lcp2_failback() fills from
 * lcp2_imbalances[] and sorts with lcp2_cmp_imbalance_pnn().
 * NOTE(review): the member declarations are not visible in this listing. */
2001 struct lcp2_imbalance_pnn {
2006 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
2008 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
2009 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
2011 if (lipa->imbalance > lipb->imbalance) {
2013 } else if (lipa->imbalance == lipb->imbalance) {
2020 /* LCP2 algorithm for rebalancing the cluster. This finds the source
2021 * node with the highest LCP2 imbalance, and then determines the best
2022 * IP/destination node combination to move from the source node.
2024 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
2025 struct ctdb_ipflags *ipflags,
2026 struct public_ip_list *all_ips,
2027 uint32_t *lcp2_imbalances,
2028 bool *rebalance_candidates)
2031 struct lcp2_imbalance_pnn * lips;
2034 numnodes = ipalloc_state->num;
2037 /* Put the imbalances and nodes into an array, sort them and
2038 * iterate through candidates. Usually the 1st one will be
2039 * used, so this doesn't cost much...
2041 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2042 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2043 lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
2044 for (i=0; i<numnodes; i++) {
2045 lips[i].imbalance = lcp2_imbalances[i];
2047 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2049 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2050 lcp2_cmp_imbalance_pnn);
2053 for (i=0; i<numnodes; i++) {
2054 /* This means that all nodes had 0 or 1 addresses, so
2055 * can't be imbalanced.
2057 if (lips[i].imbalance == 0) {
2061 if (lcp2_failback_candidate(ipalloc_state,
2066 rebalance_candidates)) {
2078 static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state,
2079 struct ctdb_ipflags *ipflags,
2080 struct public_ip_list *all_ips)
2082 struct public_ip_list *tmp_ip;
2084 /* verify that the assigned nodes can serve that public ip
2085 and set it to -1 if not
2087 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2088 if (tmp_ip->pnn == -1) {
2091 if (!can_node_host_ip(ipalloc_state, tmp_ip->pnn,
2092 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2093 /* this node can not serve this ip. */
2094 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2095 ctdb_addr_to_str(&(tmp_ip->addr)),
2102 static bool ip_alloc_deterministic_ips(struct ipalloc_state *ipalloc_state,
2103 struct ctdb_ipflags *ipflags,
2104 struct public_ip_list *all_ips)
2106 struct public_ip_list *tmp_ip;
2109 numnodes = ipalloc_state->num;
2111 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2112 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2113 * always be allocated the same way for a specific set of
2114 * available/unavailable nodes.
2117 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2118 tmp_ip->pnn = i % numnodes;
2121 /* IP failback doesn't make sense with deterministic
2122 * IPs, since the modulo step above implicitly fails
2123 * back IPs to their "home" node.
2125 if (1 == ipalloc_state->no_ip_failback) {
2126 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2129 unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2131 basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2133 /* No failback here! */
2138 static bool ip_alloc_nondeterministic_ips(struct ipalloc_state *ipalloc_state,
2139 struct ctdb_ipflags *ipflags,
2140 struct public_ip_list *all_ips)
2142 /* This should be pushed down into basic_failback. */
2143 struct public_ip_list *tmp_ip;
2145 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2149 unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2151 basic_allocate_unassigned(ipalloc_state, ipflags, all_ips);
2153 /* If we don't want IPs to fail back then don't rebalance IPs. */
2154 if (1 == ipalloc_state->no_ip_failback) {
2158 /* Now, try to make sure the ip adresses are evenly distributed
2161 basic_failback(ipalloc_state, ipflags, all_ips, num_ips);
2166 static bool ip_alloc_lcp2(struct ipalloc_state *ipalloc_state,
2167 struct ctdb_ipflags *ipflags,
2168 struct public_ip_list *all_ips,
2169 uint32_t *force_rebalance_nodes)
2171 uint32_t *lcp2_imbalances;
2172 bool *rebalance_candidates;
2173 int numnodes, num_rebalance_candidates, i;
2176 unassign_unsuitable_ips(ipalloc_state, ipflags, all_ips);
2178 if (!lcp2_init(ipalloc_state, ipflags, all_ips,force_rebalance_nodes,
2179 &lcp2_imbalances, &rebalance_candidates)) {
2184 lcp2_allocate_unassigned(ipalloc_state, ipflags, all_ips, lcp2_imbalances);
2186 /* If we don't want IPs to fail back then don't rebalance IPs. */
2187 if (1 == ipalloc_state->no_ip_failback) {
2191 /* It is only worth continuing if we have suitable target
2192 * nodes to transfer IPs to. This check is much cheaper than
2195 numnodes = ipalloc_state->num;
2196 num_rebalance_candidates = 0;
2197 for (i=0; i<numnodes; i++) {
2198 if (rebalance_candidates[i]) {
2199 num_rebalance_candidates++;
2202 if (num_rebalance_candidates == 0) {
2206 /* Now, try to make sure the ip adresses are evenly distributed
2209 lcp2_failback(ipalloc_state, ipflags, all_ips,
2210 lcp2_imbalances, rebalance_candidates);
2216 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2220 for (i=0;i<nodemap->num;i++) {
2221 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2222 /* Found one completely healthy node */
2230 /* The calculation part of the IP allocation algorithm. */
2231 static bool ctdb_takeover_run_core(struct ipalloc_state *ipalloc_state,
2232 struct ctdb_ipflags *ipflags,
2233 struct public_ip_list *all_ips,
2234 uint32_t *force_rebalance_nodes)
2238 switch (ipalloc_state->algorithm) {
2240 ret = ip_alloc_lcp2(ipalloc_state, ipflags, all_ips,
2241 force_rebalance_nodes);
2243 case IPALLOC_DETERMINISTIC:
2244 ret = ip_alloc_deterministic_ips(ipalloc_state, ipflags, all_ips);
2246 case IPALLOC_NONDETERMINISTIC:
2247 ret = ip_alloc_nondeterministic_ips(ipalloc_state, ipflags, all_ips);
2251 /* at this point ->pnn is the node which will own each IP
2252 or -1 if there is no node that can cover this ip
/* State shared between get_tunable_from_nodes() and its async
 * callbacks; "tunable" is the name of the tunable being fetched and is
 * used in log messages.
 * NOTE(review): the remaining members are not visible in this listing. */
2258 struct get_tunable_callback_data {
2259 const char *tunable;
/*
 * get_tunable_callback(): per-node reply handler for the get-tunable
 * control.  The reply must be exactly sizeof(uint32_t) bytes, otherwise
 * an error naming the tunable and node is logged.  The size of cd->out
 * is obtained with talloc_array_length() and used in an error message
 * about replies from nodes beyond the nodemap.  On the normal path the
 * 32-bit value in outdata.dptr is stored in cd->out[pnn].
 * NOTE(review): the conditions guarding the early exits and the exits
 * themselves are not visible in this listing.
 */
2264 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2265 int32_t res, TDB_DATA outdata,
2268 struct get_tunable_callback_data *cd =
2269 (struct get_tunable_callback_data *)callback;
2273 /* Already handled in fail callback */
2277 if (outdata.dsize != sizeof(uint32_t)) {
2278 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2279 cd->tunable, pnn, (int)sizeof(uint32_t),
2280 (int)outdata.dsize));
2285 size = talloc_array_length(cd->out);
2287 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2288 cd->tunable, pnn, size));
2293 cd->out[pnn] = *(uint32_t *)outdata.dptr;
/*
 * get_tunable_fail_callback(): per-node failure handler for the
 * get-tunable control.  Three outcomes are logged, each naming the
 * tunable and the node: a timeout, the tunable not being implemented on
 * the node (DEBUG_WARNING), and any other unexpected error.
 * NOTE(review): the conditions selecting each message, and any state
 * recorded in the callback data, are not visible in this listing.
 */
2296 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2297 int32_t res, TDB_DATA outdata,
2300 struct get_tunable_callback_data *cd =
2301 (struct get_tunable_callback_data *)callback;
2306 ("Timed out getting tunable \"%s\" from node %d\n",
2312 DEBUG(DEBUG_WARNING,
2313 ("Tunable \"%s\" not implemented on node %d\n",
2318 ("Unexpected error getting tunable \"%s\" from node %d\n",
/*
 * get_tunable_from_nodes(): fetch one tunable from the connected nodes.
 *
 * A talloc array of nodemap->num values is allocated on tmp_ctx and
 * pre-filled with default_value.  A ctdb_control_get_tunable request
 * carrying the NUL-terminated tunable name is built and sent with
 * CTDB_CONTROL_GET_TUNABLE to list_of_connected_nodes(), using
 * TAKEOVER_TIMEOUT() and the two get_tunable callbacks; the "fatal"
 * flag of the callback data is consulted when the control reports
 * failure.  The request buffer is freed before returning.
 * NOTE(review): what happens on a fatal failure and the return
 * statement are not visible here - presumably the value array is
 * returned; confirm.
 * NOTE(review): no NULL check of the talloc_size() result is visible
 * before it is written to.
 */
2324 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2325 TALLOC_CTX *tmp_ctx,
2326 struct ctdb_node_map_old *nodemap,
2327 const char *tunable,
2328 uint32_t default_value)
2331 struct ctdb_control_get_tunable *t;
2334 struct get_tunable_callback_data callback_data;
2337 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2338 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2339 for (i=0; i<nodemap->num; i++) {
2340 tvals[i] = default_value;
2343 callback_data.out = tvals;
2344 callback_data.tunable = tunable;
2345 callback_data.fatal = false;
2347 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2348 data.dptr = talloc_size(tmp_ctx, data.dsize);
2349 t = (struct ctdb_control_get_tunable *)data.dptr;
2350 t->length = strlen(tunable)+1;
2351 memcpy(t->name, tunable, t->length);
2352 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2353 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2354 nodes, 0, TAKEOVER_TIMEOUT(),
2356 get_tunable_callback,
2357 get_tunable_fail_callback,
2358 &callback_data) != 0) {
2359 if (callback_data.fatal) {
2365 talloc_free(data.dptr);
2370 /* Set internal flags for IP allocation:
2372 * Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2373 * Set NOIPHOST ip flag for each INACTIVE node
2374 * if all nodes are disabled:
2375 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2377 * Set NOIPHOST ip flags for disabled nodes
2379 static struct ctdb_ipflags *
2380 set_ipflags_internal(struct ipalloc_state *ipalloc_state,
2381 struct ctdb_node_map_old *nodemap,
2382 uint32_t *tval_noiptakeover,
2383 uint32_t *tval_noiphostonalldisabled)
2386 struct ctdb_ipflags *ipflags;
2388 /* Clear IP flags - implicit due to talloc_zero */
2389 ipflags = talloc_zero_array(ipalloc_state, struct ctdb_ipflags, nodemap->num);
2390 if (ipflags == NULL) {
2391 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
2395 for (i=0;i<nodemap->num;i++) {
2396 /* Can not take IPs on node with NoIPTakeover set */
2397 if (tval_noiptakeover[i] != 0) {
2398 ipflags[i].noiptakeover = true;
2401 /* Can not host IPs on INACTIVE node */
2402 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2403 ipflags[i].noiphost = true;
2407 if (all_nodes_are_disabled(nodemap)) {
2408 /* If all nodes are disabled, can not host IPs on node
2409 * with NoIPHostOnAllDisabled set
2411 for (i=0;i<nodemap->num;i++) {
2412 if (tval_noiphostonalldisabled[i] != 0) {
2413 ipflags[i].noiphost = true;
2417 /* If some nodes are not disabled, then can not host
2418 * IPs on DISABLED node
2420 for (i=0;i<nodemap->num;i++) {
2421 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2422 ipflags[i].noiphost = true;
/*
 * Fetch the per-node NoIPTakeover and NoIPHostOnAllDisabled tunables
 * (defaulting to 0) and turn them, together with the node flags in
 * nodemap, into the per-node IP flags array.  The temporary tunable
 * arrays are released again on the success path.
 *
 * Returns NULL if either tunable could not be obtained or if
 * set_ipflags_internal() fails.
 */
static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
					struct ipalloc_state *ipalloc_state,
					struct ctdb_node_map_old *nodemap)
{
	struct ctdb_ipflags *result;
	uint32_t *noiptakeover;
	uint32_t *noiphostonalldisabled;

	noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
					      "NoIPTakeover", 0);
	if (noiptakeover == NULL) {
		return NULL;
	}

	noiphostonalldisabled = get_tunable_from_nodes(ctdb, ipalloc_state,
						       nodemap,
						       "NoIPHostOnAllDisabled",
						       0);
	if (noiphostonalldisabled == NULL) {
		/* Caller frees tmp_ctx */
		return NULL;
	}

	result = set_ipflags_internal(ipalloc_state, nodemap,
				      noiptakeover,
				      noiphostonalldisabled);

	talloc_free(noiptakeover);
	talloc_free(noiphostonalldisabled);

	return result;
}
2463 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2464 TALLOC_CTX *mem_ctx)
2466 struct ipalloc_state *ipalloc_state =
2467 talloc_zero(mem_ctx, struct ipalloc_state);
2468 if (ipalloc_state == NULL) {
2469 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2473 ipalloc_state->num = ctdb->num_nodes;
2474 ipalloc_state->known_public_ips =
2475 talloc_zero_array(ipalloc_state,
2476 struct ctdb_public_ip_list_old *,
2477 ipalloc_state->num);
2478 if (ipalloc_state->known_public_ips == NULL) {
2479 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2480 talloc_free(ipalloc_state);
2483 ipalloc_state->available_public_ips =
2484 talloc_zero_array(ipalloc_state,
2485 struct ctdb_public_ip_list_old *,
2486 ipalloc_state->num);
2487 if (ipalloc_state->available_public_ips == NULL) {
2488 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2489 talloc_free(ipalloc_state);
2493 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2494 ipalloc_state->algorithm = IPALLOC_LCP2;
2495 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2496 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2498 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2501 ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2503 return ipalloc_state;
/* Callback data for iprealloc_fail_callback(): the caller's real
 * failure callback with its private data, and the nodemap used to look
 * up node flags.
 * NOTE(review): further members are not visible in this listing. */
2506 struct iprealloc_callback_data {
2509 client_async_callback fail_callback;
2510 void *fail_callback_data;
2511 struct ctdb_node_map_old *nodemap;
2514 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2515 int32_t res, TDB_DATA outdata,
2519 struct iprealloc_callback_data *cd =
2520 (struct iprealloc_callback_data *)callback;
2522 numnodes = talloc_array_length(cd->retry_nodes);
2523 if (pnn > numnodes) {
2525 ("ipreallocated failure from node %d, "
2526 "but only %d nodes in nodemap\n",
2531 /* Can't run the "ipreallocated" event on a INACTIVE node */
2532 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2533 DEBUG(DEBUG_WARNING,
2534 ("ipreallocated failed on inactive node %d, ignoring\n",
2541 /* If the control timed out then that's a real error,
2542 * so call the real fail callback
2544 if (cd->fail_callback) {
2545 cd->fail_callback(ctdb, pnn, res, outdata,
2546 cd->fail_callback_data);
2548 DEBUG(DEBUG_WARNING,
2549 ("iprealloc timed out but no callback registered\n"));
2553 /* If not a timeout then either the ipreallocated
2554 * eventscript (or some setup) failed. This might
2555 * have failed because the IPREALLOCATED control isn't
2556 * implemented - right now there is no way of knowing
2557 * because the error codes are all folded down to -1.
2558 * Consider retrying using EVENTSCRIPT control...
2560 DEBUG(DEBUG_WARNING,
2561 ("ipreallocated failure from node %d, flagging retry\n",
2563 cd->retry_nodes[pnn] = true;
/* Callback data for takeover_run_fail_callback(): the caller's real
 * failure callback with its private data, and the nodemap used to map a
 * PNN to its index.
 * NOTE(review): further members are not visible in this listing. */
2568 struct takeover_callback_data {
2570 client_async_callback fail_callback;
2571 void *fail_callback_data;
2572 struct ctdb_node_map_old *nodemap;
2575 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2576 uint32_t node_pnn, int32_t res,
2577 TDB_DATA outdata, void *callback_data)
2579 struct takeover_callback_data *cd =
2580 talloc_get_type_abort(callback_data,
2581 struct takeover_callback_data);
2584 for (i = 0; i < cd->nodemap->num; i++) {
2585 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2590 if (i == cd->nodemap->num) {
2591 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2595 if (!cd->node_failed[i]) {
2596 cd->node_failed[i] = true;
2597 cd->fail_callback(ctdb, node_pnn, res, outdata,
2598 cd->fail_callback_data);
2603 make any IP alias changes for public addresses that are necessary
2605 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2606 uint32_t *force_rebalance_nodes,
2607 client_async_callback fail_callback, void *callback_data)
2610 struct ctdb_public_ip ip;
2612 struct public_ip_list *all_ips, *tmp_ip;
2614 struct timeval timeout;
2615 struct client_async_data *async_data;
2616 struct ctdb_client_control_state *state;
2617 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2618 struct ctdb_ipflags *ipflags;
2619 struct ipalloc_state *ipalloc_state;
2620 struct takeover_callback_data *takeover_data;
2621 struct iprealloc_callback_data iprealloc_data;
2626 * ip failover is completely disabled, just send out the
2627 * ipreallocated event.
2629 if (ctdb->tunable.disable_ip_failover != 0) {
2633 ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2634 if (ipalloc_state == NULL) {
2635 talloc_free(tmp_ctx);
2639 ipflags = set_ipflags(ctdb, ipalloc_state, nodemap);
2640 if (ipflags == NULL) {
2641 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2642 talloc_free(tmp_ctx);
2646 /* Fetch known/available public IPs from each active node */
2647 ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2649 talloc_free(tmp_ctx);
2653 /* Short-circuit IP allocation if no node has available IPs */
2654 can_host_ips = false;
2655 for (i=0; i < ipalloc_state->num; i++) {
2656 if (ipalloc_state->available_public_ips[i] != NULL) {
2657 can_host_ips = true;
2660 if (!can_host_ips) {
2661 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2665 /* since nodes only know about those public addresses that
2666 can be served by that particular node, no single node has
2667 a full list of all public addresses that exist in the cluster.
2668 Walk over all node structures and create a merged list of
2669 all public addresses that exist in the cluster.
2671 keep the tree of ips around as ctdb->ip_tree
2673 all_ips = create_merged_ip_list(ctdb, ipalloc_state);
2675 /* Do the IP reassignment calculations */
2676 ctdb_takeover_run_core(ipalloc_state, ipflags,
2677 all_ips, force_rebalance_nodes);
2679 /* Now tell all nodes to release any public IPs should not
2680 * host. This will be a NOOP on nodes that don't currently
2681 * hold the given IP.
2683 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2684 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2686 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2687 bool, nodemap->num);
2688 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2689 takeover_data->fail_callback = fail_callback;
2690 takeover_data->fail_callback_data = callback_data;
2691 takeover_data->nodemap = nodemap;
2693 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2694 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2696 async_data->fail_callback = takeover_run_fail_callback;
2697 async_data->callback_data = takeover_data;
2699 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2701 /* Send a RELEASE_IP to all nodes that should not be hosting
2702 * each IP. For each IP, all but one of these will be
2703 * redundant. However, the redundant ones are used to tell
2704 * nodes which node should be hosting the IP so that commands
2705 * like "ctdb ip" can display a particular nodes idea of who
2706 * is hosting what. */
2707 for (i=0;i<nodemap->num;i++) {
2708 /* don't talk to unconnected nodes, but do talk to banned nodes */
2709 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2713 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2714 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2715 /* This node should be serving this
2716 vnn so don't tell it to release the ip
2720 ip.pnn = tmp_ip->pnn;
2721 ip.addr = tmp_ip->addr;
2723 timeout = TAKEOVER_TIMEOUT();
2724 data.dsize = sizeof(ip);
2725 data.dptr = (uint8_t *)&ip;
2726 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2727 0, CTDB_CONTROL_RELEASE_IP, 0,
2730 if (state == NULL) {
2731 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2732 talloc_free(tmp_ctx);
2736 ctdb_client_async_add(async_data, state);
2739 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2740 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2741 talloc_free(tmp_ctx);
2744 talloc_free(async_data);
2747 /* For each IP, send a TAKOVER_IP to the node that should be
2748 * hosting it. Many of these will often be redundant (since
2749 * the allocation won't have changed) but they can be useful
2750 * to recover from inconsistencies. */
2751 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2752 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2754 async_data->fail_callback = fail_callback;
2755 async_data->callback_data = callback_data;
2757 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2758 if (tmp_ip->pnn == -1) {
2759 /* this IP won't be taken over */
2763 ip.pnn = tmp_ip->pnn;
2764 ip.addr = tmp_ip->addr;
2766 timeout = TAKEOVER_TIMEOUT();
2767 data.dsize = sizeof(ip);
2768 data.dptr = (uint8_t *)&ip;
2769 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2770 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2771 data, async_data, &timeout, NULL);
2772 if (state == NULL) {
2773 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2774 talloc_free(tmp_ctx);
2778 ctdb_client_async_add(async_data, state);
2780 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2781 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2782 talloc_free(tmp_ctx);
2788 * Tell all nodes to run eventscripts to process the
2789 * "ipreallocated" event. This can do a lot of things,
2790 * including restarting services to reconfigure them if public
2791 * IPs have moved. Once upon a time this event only used to
2794 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2795 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2796 iprealloc_data.retry_nodes = retry_data;
2797 iprealloc_data.retry_count = 0;
2798 iprealloc_data.fail_callback = fail_callback;
2799 iprealloc_data.fail_callback_data = callback_data;
2800 iprealloc_data.nodemap = nodemap;
2802 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2803 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2804 nodes, 0, TAKEOVER_TIMEOUT(),
2806 NULL, iprealloc_fail_callback,
2809 /* If the control failed then we should retry to any
2810 * nodes flagged by iprealloc_fail_callback using the
2811 * EVENTSCRIPT control. This is a best-effort at
2812 * backward compatibility when running a mixed cluster
2813 * where some nodes have not yet been upgraded to
2814 * support the IPREALLOCATED control.
2816 DEBUG(DEBUG_WARNING,
2817 ("Retry ipreallocated to some nodes using eventscript control\n"));
2819 nodes = talloc_array(tmp_ctx, uint32_t,
2820 iprealloc_data.retry_count);
2821 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2824 for (i=0; i<nodemap->num; i++) {
2825 if (iprealloc_data.retry_nodes[i]) {
2831 data.dptr = discard_const("ipreallocated");
2832 data.dsize = strlen((char *)data.dptr) + 1;
2833 ret = ctdb_client_async_control(ctdb,
2834 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2835 nodes, 0, TAKEOVER_TIMEOUT(),
2837 NULL, fail_callback,
2840 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2844 talloc_free(tmp_ctx);
2850 destroy a ctdb_client_ip structure
2852 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2854 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2855 ctdb_addr_to_str(&ip->addr),
2856 ntohs(ip->addr.ip.sin_port),
2859 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2864 called by a client to inform us of a TCP connection that it is managing
2865 that should be tickled with an ACK when IP takeover is done
// Control handler: a client (identified by client_id) registers a TCP
// connection it is managing so that it can be tickled after an IP takeover.
// indata.dptr is read as a struct ctdb_connection (src/dst endpoints).
// Visible flow: the function tests for a node without public addresses
// first; both endpoints are canonicalised, the destination is looked up
// as a public address, the connection is recorded against the client and
// a CTDB_CONTROL_TCP_ADD is broadcast to all connected nodes.
// NOTE(review): client is the result of reqid_find() and is used below
// (talloc parent, client->pid, client->tcp_list) without a visible NULL
// check - confirm that client_id is always valid on this path.
2867 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2870 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2871 struct ctdb_connection *tcp_sock = NULL;
2872 struct ctdb_tcp_list *tcp;
2873 struct ctdb_connection t;
2876 struct ctdb_client_ip *ip;
2877 struct ctdb_vnn *vnn;
2878 ctdb_sock_addr addr;
2880 /* If we don't have public IPs, tickles are useless */
2881 if (ctdb->vnn == NULL) {
2885 tcp_sock = (struct ctdb_connection *)indata.dptr;
// Canonicalise each endpoint through a temporary copy, writing the result
// back into the request buffer (presumably ctdb_canonicalize_ip(in, out) -
// confirm the argument order).
2887 addr = tcp_sock->src;
2888 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2889 addr = tcp_sock->dst;
2890 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
// Look up the public address (vnn) that the canonical destination belongs to.
2893 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2894 vnn = find_public_ip_vnn(ctdb, &addr);
// The messages below report a destination that is not a public address;
// an IPv4 loopback destination is not reported. NOTE(review): the condition
// guarding this switch is not visible here - presumably vnn == NULL.
2896 switch (addr.sa.sa_family) {
2898 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2899 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2900 ctdb_addr_to_str(&addr)));
2904 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2905 ctdb_addr_to_str(&addr)));
2908 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
// Only the node that currently holds the address (vnn->pnn) accepts
// registrations for it.
2914 if (vnn->pnn != ctdb->pnn) {
2915 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2916 ctdb_addr_to_str(&addr),
2917 client_id, client->pid));
2918 /* failing this call will tell smbd to die */
// Remember the client/IP association as a talloc child of the client; the
// destructor unlinks it from ctdb->client_ip_list when the entry is freed.
2922 ip = talloc(client, struct ctdb_client_ip);
2923 CTDB_NO_MEMORY(ctdb, ip);
2927 ip->client_id = client_id;
2928 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2929 DLIST_ADD(ctdb->client_ip_list, ip);
// Remember the connection itself on the tcp_list of the client.
2931 tcp = talloc(client, struct ctdb_tcp_list);
2932 CTDB_NO_MEMORY(ctdb, tcp);
2934 tcp->connection.src = tcp_sock->src;
2935 tcp->connection.dst = tcp_sock->dst;
2937 DLIST_ADD(client->tcp_list, tcp);
// Payload for the broadcast: a copy of the canonical connection.
2939 t.src = tcp_sock->src;
2940 t.dst = tcp_sock->dst;
2942 data.dptr = (uint8_t *)&t;
2943 data.dsize = sizeof(t);
// Log the registration with the port fields that match the address family.
2945 switch (addr.sa.sa_family) {
2947 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2948 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2949 ctdb_addr_to_str(&tcp_sock->src),
2950 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2953 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2954 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2955 ctdb_addr_to_str(&tcp_sock->src),
2956 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2959 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2963 /* tell all nodes about this tcp connection */
2964 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2965 CTDB_CONTROL_TCP_ADD,
2966 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2968 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2976 find a tcp address on a list
2978 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2979 struct ctdb_connection *tcp)
2983 if (array == NULL) {
2987 for (i=0;i<array->num;i++) {
2988 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2989 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2990 return &array->connections[i];
2999 called by a daemon to inform us of a TCP connection that one of its
3000 clients managing that should tickled with an ACK when IP takeover is
3003 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3005 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
3006 struct ctdb_tcp_array *tcparray;
3007 struct ctdb_connection tcp;
3008 struct ctdb_vnn *vnn;
3010 /* If we don't have public IPs, tickles are useless */
3011 if (ctdb->vnn == NULL) {
3015 vnn = find_public_ip_vnn(ctdb, &p->dst);
3017 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3018 ctdb_addr_to_str(&p->dst)));
3024 tcparray = vnn->tcp_array;
3026 /* If this is the first tickle */
3027 if (tcparray == NULL) {
3028 tcparray = talloc(vnn, struct ctdb_tcp_array);
3029 CTDB_NO_MEMORY(ctdb, tcparray);
3030 vnn->tcp_array = tcparray;
3033 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3034 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3036 tcparray->connections[tcparray->num].src = p->src;
3037 tcparray->connections[tcparray->num].dst = p->dst;
3040 if (tcp_update_needed) {
3041 vnn->tcp_update_needed = true;
3047 /* Do we already have this tickle ?*/
3050 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3051 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3052 ctdb_addr_to_str(&tcp.dst),
3053 ntohs(tcp.dst.ip.sin_port),
3058 /* A new tickle, we must add it to the array */
3059 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3060 struct ctdb_connection,
3062 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3064 tcparray->connections[tcparray->num].src = p->src;
3065 tcparray->connections[tcparray->num].dst = p->dst;
3068 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3069 ctdb_addr_to_str(&tcp.dst),
3070 ntohs(tcp.dst.ip.sin_port),
3073 if (tcp_update_needed) {
3074 vnn->tcp_update_needed = true;
3082 called by a daemon to inform us of a TCP connection that one of its
3083 clients is managing and that should be tickled with an ACK when IP takeover is
// Remove one connection from the tickle array of the public address that
// matches conn->dst. A message is logged when the address is not public,
// when the address has no tickle array, or when the connection is not in
// the array. On removal the array is freed once it becomes empty and the
// address is flagged with tcp_update_needed.
3086 static void ctdb_remove_connection(struct ctdb_context *ctdb, struct ctdb_connection *conn)
3088 struct ctdb_connection *tcpp;
3089 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst);
3092 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3093 ctdb_addr_to_str(&conn->dst)));
3097 /* if the array is empty we cant remove it
3098 and we don't need to do anything
3100 if (vnn->tcp_array == NULL) {
3101 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3102 ctdb_addr_to_str(&conn->dst),
3103 ntohs(conn->dst.ip.sin_port)));
3108 /* See if we know this connection
3109 if we don't know this connection then we dont need to do anything
3111 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3113 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3114 ctdb_addr_to_str(&conn->dst),
3115 ntohs(conn->dst.ip.sin_port)));
3120 /* We need to remove this entry from the array.
3121 Instead of allocating a new array and copying data to it
3122 we cheat and just copy the last entry in the existing array
3123 to the entry that is to be removed and just shring the
3126 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3127 vnn->tcp_array->num--;
3129 /* If we deleted the last entry we also need to remove the entire array
3131 if (vnn->tcp_array->num == 0) {
3132 talloc_free(vnn->tcp_array);
3133 vnn->tcp_array = NULL;
// Flag the address so the changed tickle list is sent out again by
// ctdb_update_tcp_tickles().
3136 vnn->tcp_update_needed = true;
// NOTE(review): this message prints conn->src while the lookup and the
// other messages in this function use conn->dst - confirm that is intended.
3138 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3139 ctdb_addr_to_str(&conn->src),
3140 ntohs(conn->src.ip.sin_port)));
3145 called by a daemon to inform us of a TCP connection that one of its
3146 clients used are no longer needed in the tickle database
3148 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3150 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3152 /* If we don't have public IPs, tickles are useless */
3153 if (ctdb->vnn == NULL) {
3157 ctdb_remove_connection(ctdb, conn);
3164 Called when another daemon starts - causes all tickles for all
3165 public addresses we are serving to be sent to the new node on the
3166 next check. This actually causes the next scheduled call to
3167 tdb_update_tcp_tickles() to update all nodes. This is simple and
3168 doesn't require careful error handling.
3170 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3172 struct ctdb_vnn *vnn;
3174 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3175 (unsigned long) pnn));
3177 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3178 vnn->tcp_update_needed = true;
3186 called when a client structure goes away - hook to remove
3187 elements from the tcp_list in all daemons
// Hook run when a client structure goes away: every entry is detached from
// client->tcp_list and the corresponding connection is removed from the
// tickle array of its public address via ctdb_remove_connection().
3189 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3191 while (client->tcp_list) {
3192 struct ctdb_tcp_list *tcp = client->tcp_list;
3193 DLIST_REMOVE(client->tcp_list, tcp);
3194 ctdb_remove_connection(client->ctdb, &tcp->connection);
// Walk all public addresses and release those configured on this system by
// running the CTDB_EVENT_RELEASE_IP event script, killing the clients of
// the address and dropping its interface assignment.
// The disable_ip_failover tunable is tested first; NOTE(review): the action
// taken when it is 1 is not visible here, presumably an early return.
3199 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3201 struct ctdb_vnn *vnn;
3204 if (ctdb->tunable.disable_ip_failover == 1) {
3208 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
// Address not present on this system: only the interface assignment is dropped.
3209 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3210 ctdb_vnn_unassign_iface(ctdb, vnn);
3217 /* Don't allow multiple releases at once. Some code,
3218 * particularly ctdb_tickle_sentenced_connections() is
3220 if (vnn->update_in_flight) {
3221 DEBUG(DEBUG_WARNING,
3223 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3224 ctdb_addr_to_str(&vnn->public_address),
3225 vnn->public_netmask_bits,
3226 ctdb_vnn_iface_string(vnn)));
// update_in_flight brackets the release; it is cleared again after the
// cleanup calls below.
3229 vnn->update_in_flight = true;
3231 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3232 ctdb_addr_to_str(&vnn->public_address),
3233 vnn->public_netmask_bits,
3234 ctdb_vnn_iface_string(vnn)));
// Event script arguments: interface name, address, netmask bits.
3236 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3237 ctdb_vnn_iface_string(vnn),
3238 ctdb_addr_to_str(&vnn->public_address),
3239 vnn->public_netmask_bits);
3240 release_kill_clients(ctdb, &vnn->public_address);
3241 ctdb_vnn_unassign_iface(ctdb, vnn);
3242 vnn->update_in_flight = false;
// NOTE(review): the declaration and update of count are not visible here;
// presumably it counts the addresses released by the loop above.
3246 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3251 get list of public IPs
// Control handler returning the public addresses known to this node. The
// reply is a struct ctdb_public_ip_list_old allocated on outdata, with the
// pnn and address of each entry.
// With CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE set in c->flags, addresses for
// which ctdb_vnn_available() fails are treated specially in the fill loop
// (presumably skipped - confirm).
3253 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3254 struct ctdb_req_control_old *c, TDB_DATA *outdata)
3257 struct ctdb_public_ip_list_old *ips;
3258 struct ctdb_vnn *vnn;
3259 bool only_available = false;
3261 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3262 only_available = true;
3265 /* count how many public ip structures we have */
3267 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
// Zeroed allocation sized for the list header plus num entries.
3271 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3272 num*sizeof(struct ctdb_public_ip);
3273 ips = talloc_zero_size(outdata, len);
3274 CTDB_NO_MEMORY(ctdb, ips);
3277 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3278 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3281 ips->ips[i].pnn = vnn->pnn;
3282 ips->ips[i].addr = vnn->public_address;
// The reply length is recomputed from i, the index reached by the fill loop.
3286 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3287 i*sizeof(struct ctdb_public_ip);
3289 outdata->dsize = len;
3290 outdata->dptr = (uint8_t *)ips;
// Control handler returning details for one public address: the address
// and its pnn, plus name, link state and reference count of each interface
// listed for it. indata.dptr is read as a ctdb_sock_addr. The reply is a
// struct ctdb_public_ip_info_old allocated on outdata.
3296 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3297 struct ctdb_req_control_old *c,
3302 ctdb_sock_addr *addr;
3303 struct ctdb_public_ip_info_old *info;
3304 struct ctdb_vnn *vnn;
3306 addr = (ctdb_sock_addr *)indata.dptr;
3308 vnn = find_public_ip_vnn(ctdb, addr);
3310 /* if it is not a public ip it could be our 'single ip' */
3311 if (ctdb->single_ip_vnn) {
3312 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3313 vnn = ctdb->single_ip_vnn;
3318 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3319 "'%s'not a public address\n",
3320 ctdb_addr_to_str(addr)));
3324 /* count how many public ip structures we have */
// vnn->ifaces is walked until a NULL entry, i.e. it is treated as a
// NULL-terminated array of interface names.
3326 for (;vnn->ifaces[num];) {
3330 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3331 num*sizeof(struct ctdb_iface);
3332 info = talloc_zero_size(outdata, len);
3333 CTDB_NO_MEMORY(ctdb, info);
3335 info->ip.addr = vnn->public_address;
3336 info->ip.pnn = vnn->pnn;
// 0xFFFFFFFF is the initial value of active_idx; it is replaced in the loop
// when an interface matches vnn->iface.
3337 info->active_idx = 0xFFFFFFFF;
3339 for (i=0; vnn->ifaces[i]; i++) {
3340 struct ctdb_interface *cur;
3342 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3344 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3348 if (vnn->iface == cur) {
3349 info->active_idx = i;
// info was zero-allocated and the copy stops one byte short of the field,
// so the name stays NUL-terminated.
3351 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3352 info->ifaces[i].link_state = cur->link_up;
3353 info->ifaces[i].references = cur->references;
3356 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3357 i*sizeof(struct ctdb_iface);
3359 outdata->dsize = len;
3360 outdata->dptr = (uint8_t *)info;
3365 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3366 struct ctdb_req_control_old *c,
3370 struct ctdb_iface_list_old *ifaces;
3371 struct ctdb_interface *cur;
3373 /* count how many public ip structures we have */
3375 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3379 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3380 num*sizeof(struct ctdb_iface);
3381 ifaces = talloc_zero_size(outdata, len);
3382 CTDB_NO_MEMORY(ctdb, ifaces);
3385 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3386 strcpy(ifaces->ifaces[i].name, cur->name);
3387 ifaces->ifaces[i].link_state = cur->link_up;
3388 ifaces->ifaces[i].references = cur->references;
3392 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3393 i*sizeof(struct ctdb_iface);
3395 outdata->dsize = len;
3396 outdata->dptr = (uint8_t *)ifaces;
// Control handler that updates the recorded link state of one interface.
// indata.dptr is read as a struct ctdb_iface. Visible validation: the name
// must have a NUL at index CTDB_IFACE_SIZE, link_state must be a value the
// switch accepts, and references must be 0. The interface is looked up by
// name and a change of state is logged and stored.
3401 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3402 struct ctdb_req_control_old *c,
3405 struct ctdb_iface *info;
3406 struct ctdb_interface *iface;
3407 bool link_up = false;
3409 info = (struct ctdb_iface *)indata.dptr;
3411 if (info->name[CTDB_IFACE_SIZE] != '\0') {
// strnlen bounds the printed length because the name is not terminated.
3412 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3413 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3414 len, len, info->name));
3418 switch (info->link_state) {
3426 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3427 (unsigned int)info->link_state));
3431 if (info->references != 0) {
3432 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3433 (unsigned int)info->references));
3437 iface = ctdb_find_iface(ctdb, info->name);
3438 if (iface == NULL) {
3442 if (link_up == iface->link_up) {
// The log level depends on the old state: DEBUG_ERR when the interface was
// up, DEBUG_NOTICE when it was down.
3446 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3447 ("iface[%s] has changed it's link status %s => %s\n",
3449 iface->link_up?"up":"down",
3450 link_up?"up":"down"));
3452 iface->link_up = link_up;
3458 structure containing the listening socket and the list of tcp connections
3459 that the ctdb daemon is to kill
// State for killing TCP connections on one public address.
3461 struct ctdb_kill_tcp {
// Public address this state belongs to; vnn->killtcp points back here.
3462 struct ctdb_vnn *vnn;
3463 struct ctdb_context *ctdb;
// fd event for the capture socket, handled by capture_tcp_handler().
3465 struct tevent_fd *fde;
// Tree of struct ctdb_killtcp_con entries keyed by killtcp_key().
3466 trbt_tree_t *connections;
3471 a tcp connection that is to be killed
// One TCP connection queued for a reset.
3473 struct ctdb_killtcp_con {
3474 ctdb_sock_addr src_addr;
3475 ctdb_sock_addr dst_addr;
// Kill state this connection is queued on.
3477 struct ctdb_kill_tcp *killtcp;
3480 /* this function is used to create a key to represent this socketpair
3481 in the killtcp tree.
3482 this key is used to insert and lookup matching socketpairs that are
3483 to be tickled and RST
// Number of 32-bit words in a killtcp tree key.
3485 #define KILLTCP_KEYLEN 10
// Build the tree key for a socket pair. The key lives in a static buffer,
// so the returned pointer is only valid until the next call and the
// function is not reentrant. The buffer is zeroed first, so words that a
// family does not fill stay 0.
3486 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3488 static uint32_t key[KILLTCP_KEYLEN];
3490 bzero(key, sizeof(key));
3492 if (src->sa.sa_family != dst->sa.sa_family) {
3493 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3497 switch (src->sa.sa_family) {
// IPv4 layout: destination address, source address, destination port,
// source port; the remaining words stay 0.
3499 key[0] = dst->ip.sin_addr.s_addr;
3500 key[1] = src->ip.sin_addr.s_addr;
3501 key[2] = dst->ip.sin_port;
3502 key[3] = src->ip.sin_port;
// IPv6 layout: the four 32-bit words of both addresses interleaved
// (destination first, highest index first), followed by the two ports.
3505 uint32_t *dst6_addr32 =
3506 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3507 uint32_t *src6_addr32 =
3508 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3509 key[0] = dst6_addr32[3];
3510 key[1] = src6_addr32[3];
3511 key[2] = dst6_addr32[2];
3512 key[3] = src6_addr32[2];
3513 key[4] = dst6_addr32[1];
3514 key[5] = src6_addr32[1];
3515 key[6] = dst6_addr32[0];
3516 key[7] = src6_addr32[0];
3517 key[8] = dst->ip6.sin6_port;
3518 key[9] = src->ip6.sin6_port;
3522 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3530 called when we get a read event on the raw socket
// fd event handler for the capture socket. Each packet accepted by
// ctdb_sys_read_tcp_packet() is looked up in killtcp->connections; for a
// match, ctdb_sys_send_tcp() is called with the captured sequence numbers
// and a last argument of 1 (the log message calls this a tcp reset).
3532 static void capture_tcp_handler(struct tevent_context *ev,
3533 struct tevent_fd *fde,
3534 uint16_t flags, void *private_data)
3536 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3537 struct ctdb_killtcp_con *con;
3538 ctdb_sock_addr src, dst;
3539 uint32_t ack_seq, seq;
3541 if (!(flags & TEVENT_FD_READ)) {
3545 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3546 killtcp->private_data,
3548 &ack_seq, &seq) != 0) {
3549 /* probably a non-tcp ACK packet */
3553 /* check if we have this guy in our list of connections
// Entries are inserted with the key (con->dst_addr, con->src_addr), so a
// packet whose source is con->dst_addr and whose destination is
// con->src_addr matches the stored connection.
3556 con = trbt_lookuparray32(killtcp->connections,
3557 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3559 /* no this was some other packet we can just ignore */
3563 /* This one has been tickled !
3564 now reset him and remove him from the list.
3566 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3567 ntohs(con->dst_addr.ip.sin_port),
3568 ctdb_addr_to_str(&con->src_addr),
3569 ntohs(con->src_addr.ip.sin_port)));
3571 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3576 /* when traversing the list of all tcp connections to send tickle acks to
3577 (so that we can capture the ack coming back and kill the connection
3579 this callback is called for each connection we are currently trying to kill
// Tree traversal callback, invoked once per queued connection. param is the
// talloc context collecting connections to delete after the traversal;
// data is the struct ctdb_killtcp_con being visited.
3581 static int tickle_connection_traverse(void *param, void *data)
3583 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3585 /* have tried too many times, just give up */
3586 if (con->count >= 5) {
3587 /* can't delete in traverse: reparent to delete_cons */
3588 talloc_steal(param, con);
3592 /* otherwise, try tickling it again */
3595 (ctdb_sock_addr *)&con->dst_addr,
3596 (ctdb_sock_addr *)&con->src_addr,
3603 called every second until all sentenced connections have been reset
// Timer handler: tickle every queued connection, free the ones the
// traversal gave up on, then either free the whole killtcp structure when
// the tree is empty or schedule itself again one second later.
3605 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3606 struct tevent_timer *te,
3607 struct timeval t, void *private_data)
3609 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
// Scratch context: tickle_connection_traverse() reparents exhausted
// connections onto it so they can be freed once the traversal is over.
3610 void *delete_cons = talloc_new(NULL);
3612 /* loop over all connections sending tickle ACKs */
3613 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3615 /* now we've finished traverse, it's safe to do deletion. */
3616 talloc_free(delete_cons);
3618 /* If there are no more connections to kill we can remove the
3619 entire killtcp structure
3621 if ( (killtcp->connections == NULL) ||
3622 (killtcp->connections->root == NULL) ) {
3623 talloc_free(killtcp);
3627 /* try tickling them again in a seconds time
// NOTE(review): this must not run after killtcp was freed above; the return
// that presumably guards it is not visible here.
3629 tevent_add_timer(killtcp->ctdb->ev, killtcp,
3630 timeval_current_ofs(1, 0),
3631 ctdb_tickle_sentenced_connections, killtcp);
3635 destroy the killtcp structure
// talloc destructor for a ctdb_kill_tcp. Clears the back pointer
// vnn->killtcp, but only after checking that the vnn is still on the
// ctdb->vnn list and still points at this structure.
3637 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3639 struct ctdb_vnn *tmpvnn;
3641 /* verify that this vnn is still active */
// killtcp->vnn is compared by pointer against the live list before it is
// dereferenced further down.
3642 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3643 if (tmpvnn == killtcp->vnn) {
3648 if (tmpvnn == NULL) {
3652 if (killtcp->vnn->killtcp != killtcp) {
3656 killtcp->vnn->killtcp = NULL;
3662 /* nothing fancy here, just unconditionally replace any existing
3663 connection structure with the new one.
3665 don't even free the old one if it did exist, that one is talloc_stolen
3666 by the same node in the tree anyway and will be deleted when the new data
// Callback handed to trbt_insertarray32_callback() when a connection is
// added to the killtcp tree. NOTE(review): the body is not visible here.
3669 static void *add_killtcp_callback(void *parm, void *data)
3675 add a tcp socket to the list of connections we want to RST
// Queue one TCP connection (endpoints s and d) for a reset. Both endpoints
// are canonicalised, the owning public address is looked up by destination
// and by source (with the single-IP vnn as a further candidate), and the
// connection is inserted into the killtcp tree of that address. The capture
// socket, its fd event and the one-second tickle timer are created on
// first use.
3677 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3681 ctdb_sock_addr src, dst;
3682 struct ctdb_kill_tcp *killtcp;
3683 struct ctdb_killtcp_con *con;
3684 struct ctdb_vnn *vnn;
3686 ctdb_canonicalize_ip(s, &src);
3687 ctdb_canonicalize_ip(d, &dst);
3689 vnn = find_public_ip_vnn(ctdb, &dst);
3691 vnn = find_public_ip_vnn(ctdb, &src);
3694 /* if it is not a public ip it could be our 'single ip' */
3695 if (ctdb->single_ip_vnn) {
3696 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3697 vnn = ctdb->single_ip_vnn;
3702 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3706 killtcp = vnn->killtcp;
3708 /* If this is the first connection to kill we must allocate
3711 if (killtcp == NULL) {
3712 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3713 CTDB_NO_MEMORY(ctdb, killtcp);
3716 killtcp->ctdb = ctdb;
3717 killtcp->capture_fd = -1;
// NOTE(review): the result of trbt_create() is not checked in the visible lines.
3718 killtcp->connections = trbt_create(killtcp, 0);
3720 vnn->killtcp = killtcp;
3721 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3726 /* create a structure that describes this connection we want to
3727 RST and store it in killtcp->connections
3729 con = talloc(killtcp, struct ctdb_killtcp_con);
3730 CTDB_NO_MEMORY(ctdb, con);
3731 con->src_addr = src;
3732 con->dst_addr = dst;
3734 con->killtcp = killtcp;
// The key is built as (dst, src) here; capture_tcp_handler() looks captured
// packets up as (src, dst).
3737 trbt_insertarray32_callback(killtcp->connections,
3738 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3739 add_killtcp_callback, con);
3742 If we don't have a socket to listen on yet we must create it
3744 if (killtcp->capture_fd == -1) {
3745 const char *iface = ctdb_vnn_iface_string(vnn);
3746 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3747 if (killtcp->capture_fd == -1) {
3748 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3749 "socket on iface '%s' for killtcp (%s)\n",
3750 iface, strerror(errno)));
3756 if (killtcp->fde == NULL) {
3757 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3758 killtcp->capture_fd,
3760 capture_tcp_handler, killtcp);
// Closing capture_fd is delegated to the fd event.
3761 tevent_fd_set_auto_close(killtcp->fde);
3763 /* We also need to set up some events to tickle all these connections
3764 until they are all reset
3766 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3767 ctdb_tickle_sentenced_connections, killtcp);
3770 /* tickle him once now */
// Cleanup that drops the whole killtcp state of the address. NOTE(review):
// the branch or label that leads here is not visible; presumably this is the
// failure path.
3779 talloc_free(vnn->killtcp);
3780 vnn->killtcp = NULL;
3785 kill a TCP connection.
3787 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3789 struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3791 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
3795 called by a daemon to inform us of the entire list of TCP tickles for
3796 a particular public address.
3797 this control should only be sent by the node that is currently serving
3798 that public address.
// Control handler: replace the tickle list of one public address with the
// list carried in indata (read as a struct ctdb_tickle_list_old). The blob
// size is validated in two steps before list->num connections are copied
// into a fresh array attached to the vnn.
3800 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3802 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3803 struct ctdb_tcp_array *tcparray;
3804 struct ctdb_vnn *vnn;
3806 /* We must at least have tickles.num or else we cant verify the size
3807 of the received data blob
3809 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3810 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3814 /* verify that the size of data matches what we expect */
// NOTE(review): the product with list->num has no explicit overflow guard;
// confirm the operand widths make wrap-around impossible on all targets.
3815 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3816 + sizeof(struct ctdb_connection) * list->num) {
3817 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3821 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3822 ctdb_addr_to_str(&list->addr)));
3824 vnn = find_public_ip_vnn(ctdb, &list->addr);
3826 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3827 ctdb_addr_to_str(&list->addr)));
3832 /* remove any old ticklelist we might have */
// The old array is released before the new one is built, so the address has
// no tickle list until the assignment at the end of the function.
3833 talloc_free(vnn->tcp_array);
3834 vnn->tcp_array = NULL;
3836 tcparray = talloc(vnn, struct ctdb_tcp_array);
3837 CTDB_NO_MEMORY(ctdb, tcparray);
3839 tcparray->num = list->num;
3841 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3842 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3844 memcpy(tcparray->connections, &list->connections[0],
3845 sizeof(struct ctdb_connection)*tcparray->num);
3847 /* We now have a new fresh tickle list array for this vnn */
3848 vnn->tcp_array = tcparray;
3854 called to return the full list of tickles for the public address associated
3855 with the provided vnn
// Control handler: return the tickle list of the public address given in
// indata (read as a ctdb_sock_addr). The reply is a
// struct ctdb_tickle_list_old allocated on outdata, sized for num connections.
3857 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3859 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3860 struct ctdb_tickle_list_old *list;
3861 struct ctdb_tcp_array *tcparray;
3863 struct ctdb_vnn *vnn;
3865 vnn = find_public_ip_vnn(ctdb, addr);
3867 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3868 ctdb_addr_to_str(addr)));
3873 tcparray = vnn->tcp_array;
// NOTE(review): vnn->tcp_array can be NULL (ctdb_remove_connection() resets
// it when the array becomes empty); the guard around this read is not
// visible here.
3875 num = tcparray->num;
3880 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3881 + sizeof(struct ctdb_connection) * num;
3883 outdata->dptr = talloc_size(outdata, outdata->dsize);
3884 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3885 list = (struct ctdb_tickle_list_old *)outdata->dptr;
3890 memcpy(&list->connections[0], tcparray->connections,
3891 sizeof(struct ctdb_connection) * num);
3899 set the list of all tcp tickles for a public address
// Broadcast the complete tickle list of one public address to all nodes
// (CTDB_BROADCAST_ALL) using CTDB_CONTROL_SET_TCP_TICKLE_LIST with
// CTDB_CTRL_FLAG_NOREPLY. The message buffer is allocated on ctdb.
3901 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3902 ctdb_sock_addr *addr,
3903 struct ctdb_tcp_array *tcparray)
3907 struct ctdb_tickle_list_old *list;
// NOTE(review): tcparray may be NULL for an address without tickles; the
// guard around this read is not visible here.
3910 num = tcparray->num;
3915 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3916 sizeof(struct ctdb_connection) * num;
3917 data.dptr = talloc_size(ctdb, data.dsize);
3918 CTDB_NO_MEMORY(ctdb, data.dptr);
3920 list = (struct ctdb_tickle_list_old *)data.dptr;
3924 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3927 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3928 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3929 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3931 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
// NOTE(review): make sure the failure path above also reaches this free; the
// buffer is parented to ctdb and would otherwise stay allocated.
3935 talloc_free(data.dptr);
3942 perform tickle updates if required
// Periodic timer handler. The loop tests each public address for being held
// by this node (vnn->pnn against ctdb->pnn) and for tcp_update_needed, and
// sends the tickle list with ctdb_send_set_tcp_tickles_for_ip(); the flag is
// cleared next to the message that reports a sent update. The handler then
// re-arms itself after tunable.tickle_update_interval seconds.
3944 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3945 struct tevent_timer *te,
3946 struct timeval t, void *private_data)
3948 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3950 struct ctdb_vnn *vnn;
3952 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3953 /* we only send out updates for public addresses that
3956 if (ctdb->pnn != vnn->pnn) {
3959 /* We only send out the updates if we need to */
3960 if (!vnn->tcp_update_needed) {
3963 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3964 &vnn->public_address,
3967 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3968 ctdb_addr_to_str(&vnn->public_address)));
3971 ("Sent tickle update for public address %s\n",
3972 ctdb_addr_to_str(&vnn->public_address)));
3973 vnn->tcp_update_needed = false;
// Re-arm on tickle_update_context, the same context the first timer was
// created on in ctdb_start_tcp_tickle_update().
3977 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3978 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3979 ctdb_update_tcp_tickles, ctdb);
3983 start periodic update of tcp tickles
3985 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3987 ctdb->tickle_update_context = talloc_new(ctdb);
3989 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3990 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3991 ctdb_update_tcp_tickles, ctdb);
// Per-request state for a series of gratuitous ARPs.
3997 struct control_gratious_arp {
3998 struct ctdb_context *ctdb;
// Address that is announced.
3999 ctdb_sock_addr addr;
4005 send a control_gratuitous arp
// Timer handler: send one gratuitous ARP for arp->addr on arp->iface via
// ctdb_sys_send_arp() and schedule the next run after CTDB_ARP_INTERVAL
// seconds. arp->count is compared with CTDB_ARP_REPEAT, presumably to end
// the series (the body of that check is not visible here).
4007 static void send_gratious_arp(struct tevent_context *ev,
4008 struct tevent_timer *te,
4009 struct timeval t, void *private_data)
4012 struct control_gratious_arp *arp = talloc_get_type(private_data,
4013 struct control_gratious_arp);
4015 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4017 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4018 arp->iface, strerror(errno)));
4023 if (arp->count == CTDB_ARP_REPEAT) {
// The timer is parented to arp, so freeing arp also cancels a pending run.
4028 tevent_add_timer(arp->ctdb->ev, arp,
4029 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4030 send_gratious_arp, arp);
// Control handler: start a series of gratuitous ARPs for the address and
// interface named in indata (read as a struct ctdb_addr_info_old). After
// two size checks on indata, a control_gratious_arp is allocated on ctdb
// and a zero-delay timer schedules the first send_gratious_arp() run.
4037 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4039 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4040 struct control_gratious_arp *arp;
4042 /* verify the size of indata */
4043 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4044 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
4045 (unsigned)indata.dsize,
4046 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4050 ( offsetof(struct ctdb_addr_info_old, iface)
4051 + gratious_arp->len ) ){
4053 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4054 "but should be %u bytes\n",
4055 (unsigned)indata.dsize,
4056 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4061 arp = talloc(ctdb, struct control_gratious_arp);
4062 CTDB_NO_MEMORY(ctdb, arp);
4065 arp->addr = gratious_arp->addr;
// NOTE(review): if this strdup fails and CTDB_NO_MEMORY returns, arp stays
// allocated on ctdb - confirm and free it on that path if so.
4066 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4067 CTDB_NO_MEMORY(ctdb, arp->iface);
4070 tevent_add_timer(arp->ctdb->ev, arp,
4071 timeval_zero(), send_gratious_arp, arp);
// Control handler: add a public address at runtime. indata is read as a
// struct ctdb_addr_info_old; after the size checks, the address, mask and
// interface string are passed to ctdb_add_public_address().
4076 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4078 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4081 /* verify the size of indata */
4082 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4083 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4087 ( offsetof(struct ctdb_addr_info_old, iface)
4090 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4091 "but should be %u bytes\n",
4092 (unsigned)indata.dsize,
4093 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4097 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4099 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4102 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
// State carried from ctdb_control_del_public_address() to delete_ip_callback().
4109 struct delete_ip_callback_state {
// Control request that delete_ip_callback() replies to.
4110 struct ctdb_req_control_old *c;
4114 called when releaseip event finishes for del_public_address
4116 static void delete_ip_callback(struct ctdb_context *ctdb,
4117 int32_t status, TDB_DATA data,
4118 const char *errormsg,
4121 struct delete_ip_callback_state *state =
4122 talloc_get_type(private_data, struct delete_ip_callback_state);
4124 /* If release failed then fail. */
4125 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4126 talloc_free(private_data);
// Control handler: delete a public address at runtime. indata is read as a
// struct ctdb_addr_info_old and size-checked. When this node currently
// holds the address, a CTDB_CONTROL_RELEASE_IP is sent first and the reply
// to the caller is deferred (*async_reply = true); otherwise the address is
// deleted directly with do_delete_ip(). An unknown address is logged.
4129 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4130 struct ctdb_req_control_old *c,
4131 TDB_DATA indata, bool *async_reply)
4133 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4134 struct ctdb_vnn *vnn;
4136 /* verify the size of indata */
4137 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4138 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4142 ( offsetof(struct ctdb_addr_info_old, iface)
4145 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4146 "but should be %u bytes\n",
4147 (unsigned)indata.dsize,
4148 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4152 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4154 /* walk over all public addresses until we find a match */
4155 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4156 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4157 if (vnn->pnn == ctdb->pnn) {
4158 struct delete_ip_callback_state *state;
4159 struct ctdb_public_ip *ip;
// Flag set before the release is requested. NOTE(review): presumably the
// release path uses it to finish the deletion - confirm where it is read.
4163 vnn->delete_pending = true;
4165 state = talloc(ctdb,
4166 struct delete_ip_callback_state);
4167 CTDB_NO_MEMORY(ctdb, state);
// The control payload is allocated as a child of state.
4170 ip = talloc(state, struct ctdb_public_ip);
4173 (__location__ " Out of memory\n"));
4178 ip->addr = pub->addr;
4180 data.dsize = sizeof(struct ctdb_public_ip);
4181 data.dptr = (unsigned char *)ip;
4183 ret = ctdb_daemon_send_control(ctdb,
4186 CTDB_CONTROL_RELEASE_IP,
4193 (__location__ "Unable to send "
4194 "CTDB_CONTROL_RELEASE_IP\n"));
// Reparent the request onto state so it stays valid until the deferred
// reply has been sent.
4199 state->c = talloc_steal(state, c);
4200 *async_reply = true;
4202 /* This IP is not hosted on the
4203 * current node so just delete it
4205 do_delete_ip(ctdb, vnn);
4212 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4213 ctdb_addr_to_str(&pub->addr)));
/* State handed to ctdb_ipreallocated_callback() while the event runs */
struct ipreallocated_callback_state {
	/* the control request that still has to be answered */
	struct ctdb_req_control_old *c;
};
4222 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4223 int status, void *p)
4225 struct ipreallocated_callback_state *state =
4226 talloc_get_type(p, struct ipreallocated_callback_state);
4230 (" \"ipreallocated\" event script failed (status %d)\n",
4232 if (status == -ETIME) {
4233 ctdb_ban_self(ctdb);
4237 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4241 /* A control to run the ipreallocated event */
4242 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4243 struct ctdb_req_control_old *c,
4247 struct ipreallocated_callback_state *state;
4249 state = talloc(ctdb, struct ipreallocated_callback_state);
4250 CTDB_NO_MEMORY(ctdb, state);
4252 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4254 ret = ctdb_event_script_callback(ctdb, state,
4255 ctdb_ipreallocated_callback, state,
4256 CTDB_EVENT_IPREALLOCATED,
4260 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4265 /* tell the control that we will be reply asynchronously */
4266 state->c = talloc_steal(state, c);
4267 *async_reply = true;
4273 /* This function is called from the recovery daemon to verify that a remote
4274 node has the expected ip allocation.
4275 This is verified against ctdb->ip_tree
4277 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4278 struct ctdb_public_ip_list_old *ips,
4281 struct public_ip_list *tmp_ip;
4284 if (ctdb->ip_tree == NULL) {
4285 /* don't know the expected allocation yet, assume remote node
4294 for (i=0; i<ips->num; i++) {
4295 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4296 if (tmp_ip == NULL) {
4297 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4301 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4305 if (tmp_ip->pnn != ips->ips[i].pnn) {
4307 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4309 ctdb_addr_to_str(&ips->ips[i].addr),
4310 ips->ips[i].pnn, tmp_ip->pnn));
4318 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4320 struct public_ip_list *tmp_ip;
4322 /* IP tree is never built if DisableIPFailover is set */
4323 if (ctdb->tunable.disable_ip_failover != 0) {
4327 if (ctdb->ip_tree == NULL) {
4328 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4332 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4333 if (tmp_ip == NULL) {
4334 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4338 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4339 tmp_ip->pnn = ip->pnn;
4344 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4346 TALLOC_FREE(ctdb->ip_tree);
/* Bookkeeping for one in-progress reload of the public addresses file */
struct ctdb_reloadips_handle {
	struct ctdb_context *ctdb;
	/* control request to answer when the handle is destroyed */
	struct ctdb_req_control_old *c;
	/* status sent in the reply */
	int status;
	/* pipe on which the child reports its one-byte result */
	int fd[2];
	/* pid of the forked reloadips child */
	pid_t child;
	/* read event on fd[0] */
	struct tevent_fd *fde;
};
4358 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4360 if (h == h->ctdb->reload_ips) {
4361 h->ctdb->reload_ips = NULL;
4364 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4367 ctdb_kill(h->ctdb, h->child, SIGKILL);
4371 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4372 struct tevent_timer *te,
4373 struct timeval t, void *private_data)
4375 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4380 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4381 struct tevent_fd *fde,
4382 uint16_t flags, void *private_data)
4384 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4389 ret = sys_read(h->fd[0], &res, 1);
4390 if (ret < 1 || res != 0) {
4391 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4399 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4401 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4402 struct ctdb_public_ip_list_old *ips;
4403 struct ctdb_vnn *vnn;
4404 struct client_async_data *async_data;
4405 struct timeval timeout;
4407 struct ctdb_client_control_state *state;
4411 CTDB_NO_MEMORY(ctdb, mem_ctx);
4413 /* Read IPs from local node */
4414 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4415 CTDB_CURRENT_NODE, mem_ctx, &ips);
4418 ("Unable to fetch public IPs from local node\n"));
4419 talloc_free(mem_ctx);
4423 /* Read IPs file - this is safe since this is a child process */
4425 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4426 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4427 talloc_free(mem_ctx);
4431 async_data = talloc_zero(mem_ctx, struct client_async_data);
4432 CTDB_NO_MEMORY(ctdb, async_data);
4434 /* Compare IPs between node and file for IPs to be deleted */
4435 for (i = 0; i < ips->num; i++) {
4437 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4438 if (ctdb_same_ip(&vnn->public_address,
4439 &ips->ips[i].addr)) {
4440 /* IP is still in file */
4446 /* Delete IP ips->ips[i] */
4447 struct ctdb_addr_info_old *pub;
4450 ("IP %s no longer configured, deleting it\n",
4451 ctdb_addr_to_str(&ips->ips[i].addr)));
4453 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4454 CTDB_NO_MEMORY(ctdb, pub);
4456 pub->addr = ips->ips[i].addr;
4460 timeout = TAKEOVER_TIMEOUT();
4462 data.dsize = offsetof(struct ctdb_addr_info_old,
4464 data.dptr = (uint8_t *)pub;
4466 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4467 CTDB_CONTROL_DEL_PUBLIC_IP,
4468 0, data, async_data,
4470 if (state == NULL) {
4473 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4477 ctdb_client_async_add(async_data, state);
4481 /* Compare IPs between node and file for IPs to be added */
4483 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4484 for (i = 0; i < ips->num; i++) {
4485 if (ctdb_same_ip(&vnn->public_address,
4486 &ips->ips[i].addr)) {
4487 /* IP already on node */
4491 if (i == ips->num) {
4492 /* Add IP ips->ips[i] */
4493 struct ctdb_addr_info_old *pub;
4494 const char *ifaces = NULL;
4499 ("New IP %s configured, adding it\n",
4500 ctdb_addr_to_str(&vnn->public_address)));
4502 uint32_t pnn = ctdb_get_pnn(ctdb);
4504 data.dsize = sizeof(pnn);
4505 data.dptr = (uint8_t *)&pnn;
4507 ret = ctdb_client_send_message(
4509 CTDB_BROADCAST_CONNECTED,
4510 CTDB_SRVID_REBALANCE_NODE,
4513 DEBUG(DEBUG_WARNING,
4514 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4520 ifaces = vnn->ifaces[0];
4522 while (vnn->ifaces[iface] != NULL) {
4523 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4524 vnn->ifaces[iface]);
4528 len = strlen(ifaces) + 1;
4529 pub = talloc_zero_size(mem_ctx,
4530 offsetof(struct ctdb_addr_info_old, iface) + len);
4531 CTDB_NO_MEMORY(ctdb, pub);
4533 pub->addr = vnn->public_address;
4534 pub->mask = vnn->public_netmask_bits;
4536 memcpy(&pub->iface[0], ifaces, pub->len);
4538 timeout = TAKEOVER_TIMEOUT();
4540 data.dsize = offsetof(struct ctdb_addr_info_old,
4542 data.dptr = (uint8_t *)pub;
4544 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4545 CTDB_CONTROL_ADD_PUBLIC_IP,
4546 0, data, async_data,
4548 if (state == NULL) {
4551 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4555 ctdb_client_async_add(async_data, state);
4559 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4560 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4564 talloc_free(mem_ctx);
4568 talloc_free(mem_ctx);
4572 /* This control is sent to force the node to re-read the public addresses file
4573 and drop any addresses we should nnot longer host, and add new addresses
4574 that we are now able to host
4576 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4578 struct ctdb_reloadips_handle *h;
4579 pid_t parent = getpid();
4581 if (ctdb->reload_ips != NULL) {
4582 talloc_free(ctdb->reload_ips);
4583 ctdb->reload_ips = NULL;
4586 h = talloc(ctdb, struct ctdb_reloadips_handle);
4587 CTDB_NO_MEMORY(ctdb, h);
4592 if (pipe(h->fd) == -1) {
4593 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4598 h->child = ctdb_fork(ctdb);
4599 if (h->child == (pid_t)-1) {
4600 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4608 if (h->child == 0) {
4609 signed char res = 0;
4612 debug_extra = talloc_asprintf(NULL, "reloadips:");
4614 prctl_set_comment("ctdb_reloadips");
4615 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4616 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4619 res = ctdb_reloadips_child(ctdb);
4621 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4625 sys_write(h->fd[1], &res, 1);
4626 /* make sure we die when our parent dies */
4627 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4633 h->c = talloc_steal(h, c);
4636 set_close_on_exec(h->fd[0]);
4638 talloc_set_destructor(h, ctdb_reloadips_destructor);
4641 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4642 ctdb_reloadips_child_handler, (void *)h);
4643 tevent_fd_set_auto_close(h->fde);
4645 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4646 ctdb_reloadips_timeout_event, h);
4648 /* we reply later */
4649 *async_reply = true;