4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
45 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
47 #define CTDB_ARP_INTERVAL 1
48 #define CTDB_ARP_REPEAT 3
50 /* Flags used in IP allocation algorithms. */
51 enum ipalloc_algorithm {
52 IPALLOC_DETERMINISTIC,
53 IPALLOC_NONDETERMINISTIC,
57 struct ipalloc_state {
60 /* Arrays with data for each node */
61 struct ctdb_public_ip_list_old **known_public_ips;
62 struct ctdb_public_ip_list_old **available_public_ips;
66 struct public_ip_list *all_ips;
67 enum ipalloc_algorithm algorithm;
68 uint32_t no_ip_failback;
69 uint32_t *force_rebalance_nodes;
72 struct ctdb_interface {
73 struct ctdb_interface *prev, *next;
79 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
82 return vnn->iface->name;
88 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
90 struct ctdb_interface *i;
92 /* Verify that we don't have an entry for this interface yet */
93 for (i=ctdb->ifaces;i;i=i->next) {
94 if (strcmp(i->name, iface) == 0) {
99 /* create a new structure for this interface */
100 i = talloc_zero(ctdb, struct ctdb_interface);
101 CTDB_NO_MEMORY_FATAL(ctdb, i);
102 i->name = talloc_strdup(i, iface);
103 CTDB_NO_MEMORY(ctdb, i->name);
107 DLIST_ADD(ctdb->ifaces, i);
112 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
117 for (n = 0; vnn->ifaces[n] != NULL; n++) {
118 if (strcmp(name, vnn->ifaces[n]) == 0) {
126 /* If any interfaces now have no possible IPs then delete them. This
127 * implementation is naive (i.e. simple) rather than clever
128 * (i.e. complex). Given that this is run on delip and that operation
129 * is rare, this doesn't need to be efficient - it needs to be
130 * foolproof. One alternative is reference counting, where the logic
131 * is distributed and can, therefore, be broken in multiple places.
132 * Another alternative is to build a red-black tree of interfaces that
133 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
134 * once) and then walking ctdb->ifaces once and deleting those not in
135 * the tree. Let's go to one of those if the naive implementation
136 * causes problems... :-)
138 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
139 struct ctdb_vnn *vnn)
141 struct ctdb_interface *i, *next;
143 /* For each interface, check if there's an IP using it. */
144 for (i = ctdb->ifaces; i != NULL; i = next) {
149 /* Only consider interfaces named in the given VNN. */
150 if (!vnn_has_interface_with_name(vnn, i->name)) {
154 /* Is the "single IP" on this interface? */
155 if ((ctdb->single_ip_vnn != NULL) &&
156 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
157 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
158 /* Found, next interface please... */
161 /* Search for a vnn with this interface. */
163 for (tv=ctdb->vnn; tv; tv=tv->next) {
164 if (vnn_has_interface_with_name(tv, i->name)) {
171 /* None of the VNNs are using this interface. */
172 DLIST_REMOVE(ctdb->ifaces, i);
179 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
182 struct ctdb_interface *i;
184 for (i=ctdb->ifaces;i;i=i->next) {
185 if (strcmp(i->name, iface) == 0) {
193 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
194 struct ctdb_vnn *vnn)
197 struct ctdb_interface *cur = NULL;
198 struct ctdb_interface *best = NULL;
200 for (i=0; vnn->ifaces[i]; i++) {
202 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
216 if (cur->references < best->references) {
225 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
226 struct ctdb_vnn *vnn)
228 struct ctdb_interface *best = NULL;
231 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
232 "still assigned to iface '%s'\n",
233 ctdb_addr_to_str(&vnn->public_address),
234 ctdb_vnn_iface_string(vnn)));
238 best = ctdb_vnn_best_iface(ctdb, vnn);
240 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
241 "cannot assign to iface any iface\n",
242 ctdb_addr_to_str(&vnn->public_address)));
248 vnn->pnn = ctdb->pnn;
250 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
251 "now assigned to iface '%s' refs[%d]\n",
252 ctdb_addr_to_str(&vnn->public_address),
253 ctdb_vnn_iface_string(vnn),
258 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
259 struct ctdb_vnn *vnn)
261 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
262 "now unassigned (old iface '%s' refs[%d])\n",
263 ctdb_addr_to_str(&vnn->public_address),
264 ctdb_vnn_iface_string(vnn),
265 vnn->iface?vnn->iface->references:0));
267 vnn->iface->references--;
270 if (vnn->pnn == ctdb->pnn) {
275 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
276 struct ctdb_vnn *vnn)
280 /* Nodes that are not RUNNING can not host IPs */
281 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
285 if (vnn->delete_pending) {
289 if (vnn->iface && vnn->iface->link_up) {
293 for (i=0; vnn->ifaces[i]; i++) {
294 struct ctdb_interface *cur;
296 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
309 struct ctdb_takeover_arp {
310 struct ctdb_context *ctdb;
313 struct ctdb_tcp_array *tcparray;
314 struct ctdb_vnn *vnn;
319 lists of tcp endpoints
321 struct ctdb_tcp_list {
322 struct ctdb_tcp_list *prev, *next;
323 struct ctdb_connection connection;
327 list of clients to kill on IP release
329 struct ctdb_client_ip {
330 struct ctdb_client_ip *prev, *next;
331 struct ctdb_context *ctdb;
338 send a gratuitous arp
340 static void ctdb_control_send_arp(struct tevent_context *ev,
341 struct tevent_timer *te,
342 struct timeval t, void *private_data)
344 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
345 struct ctdb_takeover_arp);
347 struct ctdb_tcp_array *tcparray;
348 const char *iface = ctdb_vnn_iface_string(arp->vnn);
350 ret = ctdb_sys_send_arp(&arp->addr, iface);
352 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
353 iface, strerror(errno)));
356 tcparray = arp->tcparray;
358 for (i=0;i<tcparray->num;i++) {
359 struct ctdb_connection *tcon;
361 tcon = &tcparray->connections[i];
362 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
363 (unsigned)ntohs(tcon->dst.ip.sin_port),
364 ctdb_addr_to_str(&tcon->src),
365 (unsigned)ntohs(tcon->src.ip.sin_port)));
366 ret = ctdb_sys_send_tcp(
371 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
372 ctdb_addr_to_str(&tcon->src)));
379 if (arp->count == CTDB_ARP_REPEAT) {
384 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
385 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
386 ctdb_control_send_arp, arp);
389 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
390 struct ctdb_vnn *vnn)
392 struct ctdb_takeover_arp *arp;
393 struct ctdb_tcp_array *tcparray;
395 if (!vnn->takeover_ctx) {
396 vnn->takeover_ctx = talloc_new(vnn);
397 if (!vnn->takeover_ctx) {
402 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
408 arp->addr = vnn->public_address;
411 tcparray = vnn->tcp_array;
413 /* add all of the known tcp connections for this IP to the
414 list of tcp connections to send tickle acks for */
415 arp->tcparray = talloc_steal(arp, tcparray);
417 vnn->tcp_array = NULL;
418 vnn->tcp_update_needed = true;
421 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
422 timeval_zero(), ctdb_control_send_arp, arp);
427 struct takeover_callback_state {
428 struct ctdb_req_control_old *c;
429 ctdb_sock_addr *addr;
430 struct ctdb_vnn *vnn;
433 struct ctdb_do_takeip_state {
434 struct ctdb_req_control_old *c;
435 struct ctdb_vnn *vnn;
439 called when takeip event finishes
441 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
444 struct ctdb_do_takeip_state *state =
445 talloc_get_type(private_data, struct ctdb_do_takeip_state);
450 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
452 if (status == -ETIME) {
455 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
456 ctdb_addr_to_str(&state->vnn->public_address),
457 ctdb_vnn_iface_string(state->vnn)));
458 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
460 node->flags |= NODE_FLAGS_UNHEALTHY;
465 if (ctdb->do_checkpublicip) {
467 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
469 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
476 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
477 data.dsize = strlen((char *)data.dptr) + 1;
478 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
480 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
483 /* the control succeeded */
484 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
489 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
491 state->vnn->update_in_flight = false;
496 take over an ip address
498 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
499 struct ctdb_req_control_old *c,
500 struct ctdb_vnn *vnn)
503 struct ctdb_do_takeip_state *state;
505 if (vnn->update_in_flight) {
506 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
507 "update for this IP already in flight\n",
508 ctdb_addr_to_str(&vnn->public_address),
509 vnn->public_netmask_bits));
513 ret = ctdb_vnn_assign_iface(ctdb, vnn);
515 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
516 "assign a usable interface\n",
517 ctdb_addr_to_str(&vnn->public_address),
518 vnn->public_netmask_bits));
522 state = talloc(vnn, struct ctdb_do_takeip_state);
523 CTDB_NO_MEMORY(ctdb, state);
525 state->c = talloc_steal(ctdb, c);
528 vnn->update_in_flight = true;
529 talloc_set_destructor(state, ctdb_takeip_destructor);
531 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
532 ctdb_addr_to_str(&vnn->public_address),
533 vnn->public_netmask_bits,
534 ctdb_vnn_iface_string(vnn)));
536 ret = ctdb_event_script_callback(ctdb,
538 ctdb_do_takeip_callback,
542 ctdb_vnn_iface_string(vnn),
543 ctdb_addr_to_str(&vnn->public_address),
544 vnn->public_netmask_bits);
547 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
548 ctdb_addr_to_str(&vnn->public_address),
549 ctdb_vnn_iface_string(vnn)));
557 struct ctdb_do_updateip_state {
558 struct ctdb_req_control_old *c;
559 struct ctdb_interface *old;
560 struct ctdb_vnn *vnn;
564 called when updateip event finishes
566 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
569 struct ctdb_do_updateip_state *state =
570 talloc_get_type(private_data, struct ctdb_do_updateip_state);
574 if (status == -ETIME) {
577 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
578 ctdb_addr_to_str(&state->vnn->public_address),
580 ctdb_vnn_iface_string(state->vnn)));
583 * All we can do is reset the old interface
584 * and let the next run fix it
586 ctdb_vnn_unassign_iface(ctdb, state->vnn);
587 state->vnn->iface = state->old;
588 state->vnn->iface->references++;
590 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
595 if (ctdb->do_checkpublicip) {
597 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
599 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
606 /* the control succeeded */
607 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
612 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
614 state->vnn->update_in_flight = false;
619 update (move) an ip address
621 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
622 struct ctdb_req_control_old *c,
623 struct ctdb_vnn *vnn)
626 struct ctdb_do_updateip_state *state;
627 struct ctdb_interface *old = vnn->iface;
628 const char *new_name;
630 if (vnn->update_in_flight) {
631 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
632 "update for this IP already in flight\n",
633 ctdb_addr_to_str(&vnn->public_address),
634 vnn->public_netmask_bits));
638 ctdb_vnn_unassign_iface(ctdb, vnn);
639 ret = ctdb_vnn_assign_iface(ctdb, vnn);
641 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
642 "assin a usable interface (old iface '%s')\n",
643 ctdb_addr_to_str(&vnn->public_address),
644 vnn->public_netmask_bits,
649 new_name = ctdb_vnn_iface_string(vnn);
650 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
651 /* A benign update from one interface onto itself.
652 * no need to run the eventscripts in this case, just return
655 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
659 state = talloc(vnn, struct ctdb_do_updateip_state);
660 CTDB_NO_MEMORY(ctdb, state);
662 state->c = talloc_steal(ctdb, c);
666 vnn->update_in_flight = true;
667 talloc_set_destructor(state, ctdb_updateip_destructor);
669 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
670 "interface %s to %s\n",
671 ctdb_addr_to_str(&vnn->public_address),
672 vnn->public_netmask_bits,
676 ret = ctdb_event_script_callback(ctdb,
678 ctdb_do_updateip_callback,
680 CTDB_EVENT_UPDATE_IP,
684 ctdb_addr_to_str(&vnn->public_address),
685 vnn->public_netmask_bits);
687 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
688 ctdb_addr_to_str(&vnn->public_address),
689 old->name, new_name));
698 Find the vnn whose public address matches the given ip address
699 returns NULL if the address is not known as a public address
701 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
703 struct ctdb_vnn *vnn;
705 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
706 if (ctdb_same_ip(&vnn->public_address, addr)) {
715 take over an ip address
717 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
718 struct ctdb_req_control_old *c,
723 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
724 struct ctdb_vnn *vnn;
725 bool have_ip = false;
726 bool do_updateip = false;
727 bool do_takeip = false;
728 struct ctdb_interface *best_iface = NULL;
730 if (pip->pnn != ctdb->pnn) {
731 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
732 "with pnn %d, but we're node %d\n",
733 ctdb_addr_to_str(&pip->addr),
734 pip->pnn, ctdb->pnn));
738 /* update our vnn list */
739 vnn = find_public_ip_vnn(ctdb, &pip->addr);
741 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
742 ctdb_addr_to_str(&pip->addr)));
746 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
747 have_ip = ctdb_sys_have_ip(&pip->addr);
749 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
750 if (best_iface == NULL) {
751 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
752 "a usable interface (old %s, have_ip %d)\n",
753 ctdb_addr_to_str(&vnn->public_address),
754 vnn->public_netmask_bits,
755 ctdb_vnn_iface_string(vnn),
760 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
761 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
766 if (vnn->iface == NULL && have_ip) {
767 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
768 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
769 ctdb_addr_to_str(&vnn->public_address)));
773 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
774 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
775 "and we have it on iface[%s], but it was assigned to node %d"
776 "and we are node %d, banning ourself\n",
777 ctdb_addr_to_str(&vnn->public_address),
778 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
783 if (vnn->pnn == -1 && have_ip) {
784 vnn->pnn = ctdb->pnn;
785 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
786 "and we already have it on iface[%s], update local daemon\n",
787 ctdb_addr_to_str(&vnn->public_address),
788 ctdb_vnn_iface_string(vnn)));
793 if (vnn->iface != best_iface) {
794 if (!vnn->iface->link_up) {
796 } else if (vnn->iface->references > (best_iface->references + 1)) {
797 /* only move when the rebalance gains something */
805 ctdb_vnn_unassign_iface(ctdb, vnn);
812 ret = ctdb_do_takeip(ctdb, c, vnn);
816 } else if (do_updateip) {
817 ret = ctdb_do_updateip(ctdb, c, vnn);
823 * The interface is up and the kernel knows the ip
826 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
827 ctdb_addr_to_str(&pip->addr),
828 vnn->public_netmask_bits,
829 ctdb_vnn_iface_string(vnn)));
833 /* tell ctdb_control.c that we will be replying asynchronously */
840 kill any clients that are registered with an IP that is being released
842 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
844 struct ctdb_client_ip *ip;
846 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
847 ctdb_addr_to_str(addr)));
849 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
850 ctdb_sock_addr tmp_addr;
853 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
855 ctdb_addr_to_str(&ip->addr)));
857 if (ctdb_same_ip(&tmp_addr, addr)) {
858 struct ctdb_client *client = reqid_find(ctdb->idr,
861 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
863 ctdb_addr_to_str(&ip->addr),
866 if (client->pid != 0) {
867 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
868 (unsigned)client->pid,
869 ctdb_addr_to_str(addr),
871 kill(client->pid, SIGKILL);
877 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
879 DLIST_REMOVE(ctdb->vnn, vnn);
880 ctdb_vnn_unassign_iface(ctdb, vnn);
881 ctdb_remove_orphaned_ifaces(ctdb, vnn);
886 called when releaseip event finishes
888 static void release_ip_callback(struct ctdb_context *ctdb, int status,
891 struct takeover_callback_state *state =
892 talloc_get_type(private_data, struct takeover_callback_state);
895 if (status == -ETIME) {
899 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
900 if (ctdb_sys_have_ip(state->addr)) {
902 ("IP %s still hosted during release IP callback, failing\n",
903 ctdb_addr_to_str(state->addr)));
904 ctdb_request_control_reply(ctdb, state->c,
911 /* send a message to all clients of this node telling them
912 that the cluster has been reconfigured and they should
913 release any sockets on this IP */
914 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
915 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
916 data.dsize = strlen((char *)data.dptr)+1;
918 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
920 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
922 /* kill clients that have registered with this IP */
923 release_kill_clients(ctdb, state->addr);
925 ctdb_vnn_unassign_iface(ctdb, state->vnn);
927 /* Process the IP if it has been marked for deletion */
928 if (state->vnn->delete_pending) {
929 do_delete_ip(ctdb, state->vnn);
933 /* the control succeeded */
934 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
938 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
940 if (state->vnn != NULL) {
941 state->vnn->update_in_flight = false;
947 release an ip address
949 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
950 struct ctdb_req_control_old *c,
955 struct takeover_callback_state *state;
956 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
957 struct ctdb_vnn *vnn;
960 /* update our vnn list */
961 vnn = find_public_ip_vnn(ctdb, &pip->addr);
963 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
964 ctdb_addr_to_str(&pip->addr)));
969 /* stop any previous arps */
970 talloc_free(vnn->takeover_ctx);
971 vnn->takeover_ctx = NULL;
973 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
974 * lazy multicast to drop an IP from any node that isn't the
975 * intended new node. The following makes ctdbd ignore
976 * a release for any address it doesn't host.
978 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
979 if (!ctdb_sys_have_ip(&pip->addr)) {
980 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
981 ctdb_addr_to_str(&pip->addr),
982 vnn->public_netmask_bits,
983 ctdb_vnn_iface_string(vnn)));
984 ctdb_vnn_unassign_iface(ctdb, vnn);
988 if (vnn->iface == NULL) {
989 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
990 ctdb_addr_to_str(&pip->addr),
991 vnn->public_netmask_bits));
996 /* There is a potential race between take_ip and us because we
997 * update the VNN via a callback that runs when the
998 * eventscripts have been run. Avoid the race by allowing one
999 * update to be in flight at a time.
1001 if (vnn->update_in_flight) {
1002 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
1003 "update for this IP already in flight\n",
1004 ctdb_addr_to_str(&vnn->public_address),
1005 vnn->public_netmask_bits));
1009 iface = strdup(ctdb_vnn_iface_string(vnn));
1011 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
1012 ctdb_addr_to_str(&pip->addr),
1013 vnn->public_netmask_bits,
1017 state = talloc(ctdb, struct takeover_callback_state);
1018 if (state == NULL) {
1019 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1020 __FILE__, __LINE__);
1025 state->c = talloc_steal(state, c);
1026 state->addr = talloc(state, ctdb_sock_addr);
1027 if (state->addr == NULL) {
1028 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1029 __FILE__, __LINE__);
1034 *state->addr = pip->addr;
1037 vnn->update_in_flight = true;
1038 talloc_set_destructor(state, ctdb_releaseip_destructor);
1040 ret = ctdb_event_script_callback(ctdb,
1041 state, release_ip_callback, state,
1042 CTDB_EVENT_RELEASE_IP,
1045 ctdb_addr_to_str(&pip->addr),
1046 vnn->public_netmask_bits);
1049 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1050 ctdb_addr_to_str(&pip->addr),
1051 ctdb_vnn_iface_string(vnn)));
1056 /* tell the control that we will reply asynchronously */
1057 *async_reply = true;
1061 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1062 ctdb_sock_addr *addr,
1063 unsigned mask, const char *ifaces,
1066 struct ctdb_vnn *vnn;
1073 tmp = strdup(ifaces);
1074 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1075 if (!ctdb_sys_check_iface_exists(iface)) {
1076 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1083 /* Verify that we don't have an entry for this ip yet */
1084 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1085 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1086 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1087 ctdb_addr_to_str(addr)));
1092 /* create a new vnn structure for this ip address */
1093 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1094 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1095 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1096 tmp = talloc_strdup(vnn, ifaces);
1097 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1098 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1099 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1100 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1101 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1102 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1106 vnn->ifaces[num] = NULL;
1107 vnn->public_address = *addr;
1108 vnn->public_netmask_bits = mask;
1110 if (check_address) {
1111 if (ctdb_sys_have_ip(addr)) {
1112 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1113 vnn->pnn = ctdb->pnn;
1117 for (i=0; vnn->ifaces[i]; i++) {
1118 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1120 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1121 "for public_address[%s]\n",
1122 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1128 DLIST_ADD(ctdb->vnn, vnn);
1134 setup the public address lists from a file
1136 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1142 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1143 if (lines == NULL) {
1144 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1147 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1151 for (i=0;i<nlines;i++) {
1153 ctdb_sock_addr addr;
1154 const char *addrstr;
1159 while ((*line == ' ') || (*line == '\t')) {
1165 if (strcmp(line, "") == 0) {
1168 tok = strtok(line, " \t");
1170 tok = strtok(NULL, " \t");
1172 if (NULL == ctdb->default_public_interface) {
1173 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1178 ifaces = ctdb->default_public_interface;
1183 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1184 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1188 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1189 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1200 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1204 struct ctdb_vnn *svnn;
1205 struct ctdb_interface *cur = NULL;
1209 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1210 CTDB_NO_MEMORY(ctdb, svnn);
1212 svnn->ifaces = talloc_array(svnn, const char *, 2);
1213 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1214 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1215 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1216 svnn->ifaces[1] = NULL;
1218 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1224 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1226 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1227 "for single_ip[%s]\n",
1229 ctdb_addr_to_str(&svnn->public_address)));
1234 /* assume the single public ip interface is initially "good" */
1235 cur = ctdb_find_iface(ctdb, iface);
1237 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1240 cur->link_up = true;
1242 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1248 ctdb->single_ip_vnn = svnn;
1252 struct public_ip_list {
1253 struct public_ip_list *next;
1255 ctdb_sock_addr addr;
1258 /* Given a physical node, return the number of
1259 public addresses that are currently assigned to this node.
1261 static int node_ip_coverage(int32_t pnn, struct public_ip_list *ips)
1265 for (;ips;ips=ips->next) {
1266 if (ips->pnn == pnn) {
1274 /* Can the given node host the given IP: is the public IP known to the
1275 * node and is NOIPHOST unset?
1277 static bool can_node_host_ip(struct ipalloc_state *ipalloc_state,
1279 struct public_ip_list *ip)
1281 struct ctdb_public_ip_list_old *public_ips;
1284 if (ipalloc_state->noiphost[pnn]) {
1288 public_ips = ipalloc_state->available_public_ips[pnn];
1290 if (public_ips == NULL) {
1294 for (i=0; i<public_ips->num; i++) {
1295 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1296 /* yes, this node can serve this public ip */
1304 static bool can_node_takeover_ip(struct ipalloc_state *ipalloc_state,
1306 struct public_ip_list *ip)
1308 if (ipalloc_state->noiptakeover[pnn]) {
1312 return can_node_host_ip(ipalloc_state, pnn, ip);
1315 /* search the node list for a node to take over this ip.
1316 pick the node that is currently serving the fewest ips
1317 so that the ips get spread out evenly.
1319 static int find_takeover_node(struct ipalloc_state *ipalloc_state,
1320 struct public_ip_list *ip)
1322 int pnn, min=0, num;
1325 numnodes = ipalloc_state->num;
1327 for (i=0; i<numnodes; i++) {
1328 /* verify that this node can serve this ip */
1329 if (!can_node_takeover_ip(ipalloc_state, i, ip)) {
1330 /* no it couldn't so skip to the next node */
1334 num = node_ip_coverage(i, ipalloc_state->all_ips);
1335 /* was this the first node we checked ? */
1347 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1348 ctdb_addr_to_str(&ip->addr)));
1358 static uint32_t *ip_key(ctdb_sock_addr *ip)
1360 static uint32_t key[IP_KEYLEN];
1362 bzero(key, sizeof(key));
1364 switch (ip->sa.sa_family) {
1366 key[3] = htonl(ip->ip.sin_addr.s_addr);
1369 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1370 key[0] = htonl(s6_a32[0]);
1371 key[1] = htonl(s6_a32[1]);
1372 key[2] = htonl(s6_a32[2]);
1373 key[3] = htonl(s6_a32[3]);
1377 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1384 static void *add_ip_callback(void *parm, void *data)
1386 struct public_ip_list *this_ip = parm;
1387 struct public_ip_list *prev_ip = data;
1389 if (prev_ip == NULL) {
1392 if (this_ip->pnn == -1) {
1393 this_ip->pnn = prev_ip->pnn;
1399 static int getips_count_callback(void *param, void *data)
1401 struct public_ip_list **ip_list = (struct public_ip_list **)param;
1402 struct public_ip_list *new_ip = (struct public_ip_list *)data;
1404 new_ip->next = *ip_list;
1409 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
1410 struct ctdb_public_ip_list_old *ips,
1413 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1414 struct ipalloc_state *ipalloc_state,
1415 struct ctdb_node_map_old *nodemap)
1420 if (ipalloc_state->num != nodemap->num) {
1423 " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1424 ipalloc_state->num, nodemap->num));
1428 for (j=0; j<nodemap->num; j++) {
1429 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1433 /* Retrieve the list of known public IPs from the node */
1434 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1437 ipalloc_state->known_public_ips,
1439 &ipalloc_state->known_public_ips[j]);
1442 ("Failed to read known public IPs from node: %u\n",
1447 if (ctdb->do_checkpublicip) {
1448 verify_remote_ip_allocation(ctdb,
1449 ipalloc_state->known_public_ips[j],
1453 /* Retrieve the list of available public IPs from the node */
1454 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1457 ipalloc_state->available_public_ips,
1458 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1459 &ipalloc_state->available_public_ips[j]);
1462 ("Failed to read available public IPs from node: %u\n",
1471 static struct public_ip_list *
1472 create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1475 struct public_ip_list *ip_list;
1476 struct ctdb_public_ip_list_old *public_ips;
1478 TALLOC_FREE(ctdb->ip_tree);
1479 ctdb->ip_tree = trbt_create(ctdb, 0);
1481 for (i=0; i < ctdb->num_nodes; i++) {
1482 public_ips = ipalloc_state->known_public_ips[i];
1484 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1488 /* there were no public ips for this node */
1489 if (public_ips == NULL) {
1493 for (j=0; j < public_ips->num; j++) {
1494 struct public_ip_list *tmp_ip;
1496 tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1497 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1498 /* Do not use information about IP addresses hosted
1499 * on other nodes, it may not be accurate */
1500 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1501 tmp_ip->pnn = public_ips->ips[j].pnn;
1505 tmp_ip->addr = public_ips->ips[j].addr;
1506 tmp_ip->next = NULL;
1508 trbt_insertarray32_callback(ctdb->ip_tree,
1509 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1516 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1522 * This is the length of the longtest common prefix between the IPs.
1523 * It is calculated by XOR-ing the 2 IPs together and counting the
1524 * number of leading zeroes. The implementation means that all
1525 * addresses end up being 128 bits long.
1527 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1528 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1529 * lots of nodes and IP addresses?
1531 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1533 uint32_t ip1_k[IP_KEYLEN];
1538 uint32_t distance = 0;
1540 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1542 for (i=0; i<IP_KEYLEN; i++) {
1543 x = ip1_k[i] ^ t[i];
1547 /* Count number of leading zeroes.
1548 * FIXME? This could be optimised...
1550 while ((x & (1 << 31)) == 0) {
1560 /* Calculate the IP distance for the given IP relative to IPs on the
1561 given node. The ips argument is generally the all_ips variable
1562 used in the main part of the algorithm.
1564 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1565 struct public_ip_list *ips,
1568 struct public_ip_list *t;
1573 for (t = ips; t != NULL; t = t->next) {
1574 if (t->pnn != pnn) {
1578 /* Optimisation: We never calculate the distance
1579 * between an address and itself. This allows us to
1580 * calculate the effect of removing an address from a
1581 * node by simply calculating the distance between
1582 * that address and all of the exitsing addresses.
1583 * Moreover, we assume that we're only ever dealing
1584 * with addresses from all_ips so we can identify an
1585 * address via a pointer rather than doing a more
1586 * expensive address comparison. */
1587 if (&(t->addr) == ip) {
1591 d = ip_distance(ip, &(t->addr));
1592 sum += d * d; /* Cheaper than pulling in math.h :-) */
1598 /* Return the LCP2 imbalance metric for addresses currently assigned
1601 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips, int pnn)
1603 struct public_ip_list *t;
1605 uint32_t imbalance = 0;
1607 for (t = all_ips; t != NULL; t = t->next) {
1608 if (t->pnn != pnn) {
1611 /* Pass the rest of the IPs rather than the whole
1614 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1620 /* Allocate any unassigned IPs just by looping through the IPs and
1621 * finding the best node for each.
1623 static void basic_allocate_unassigned(struct ipalloc_state *ipalloc_state)
1625 struct public_ip_list *t;
1627 /* loop over all ip's and find a physical node to cover for
1630 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1632 if (find_takeover_node(ipalloc_state, t)) {
1633 DEBUG(DEBUG_WARNING,
1634 ("Failed to find node to cover ip %s\n",
1635 ctdb_addr_to_str(&t->addr)));
1641 /* Basic non-deterministic rebalancing algorithm.
1643 static void basic_failback(struct ipalloc_state *ipalloc_state,
1647 int maxnode, maxnum, minnode, minnum, num, retries;
1648 struct public_ip_list *t;
1650 numnodes = ipalloc_state->num;
1657 /* for each ip address, loop over all nodes that can serve
1658 this ip and make sure that the difference between the node
1659 serving the most and the node serving the least ip's are
1662 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1667 /* Get the highest and lowest number of ips's served by any
1668 valid node which can serve this ip.
1672 for (i=0; i<numnodes; i++) {
1673 /* only check nodes that can actually serve this ip */
1674 if (!can_node_takeover_ip(ipalloc_state, i,
1676 /* no it couldnt so skip to the next node */
1680 num = node_ip_coverage(i, ipalloc_state->all_ips);
1681 if (maxnode == -1) {
1690 if (minnode == -1) {
1700 if (maxnode == -1) {
1701 DEBUG(DEBUG_WARNING,
1702 (__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1703 ctdb_addr_to_str(&t->addr)));
1708 /* if the spread between the smallest and largest coverage by
1709 a node is >=2 we steal one of the ips from the node with
1710 most coverage to even things out a bit.
1711 try to do this a limited number of times since we dont
1712 want to spend too much time balancing the ip coverage.
1714 if ((maxnum > minnum+1) &&
1715 (retries < (num_ips + 5))){
1716 struct public_ip_list *tt;
1718 /* Reassign one of maxnode's VNNs */
1719 for (tt = ipalloc_state->all_ips; tt != NULL; tt = tt->next) {
1720 if (tt->pnn == maxnode) {
1721 (void)find_takeover_node(ipalloc_state,
1731 static bool lcp2_init(struct ipalloc_state *ipalloc_state,
1732 uint32_t **lcp2_imbalances,
1733 bool **rebalance_candidates)
1736 struct public_ip_list *t;
1738 numnodes = ipalloc_state->num;
1740 *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
1741 if (*rebalance_candidates == NULL) {
1742 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1745 *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
1746 if (*lcp2_imbalances == NULL) {
1747 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1751 for (i=0; i<numnodes; i++) {
1752 (*lcp2_imbalances)[i] =
1753 lcp2_imbalance(ipalloc_state->all_ips, i);
1754 /* First step: assume all nodes are candidates */
1755 (*rebalance_candidates)[i] = true;
1758 /* 2nd step: if a node has IPs assigned then it must have been
1759 * healthy before, so we remove it from consideration. This
1760 * is overkill but is all we have because we don't maintain
1761 * state between takeover runs. An alternative would be to
1762 * keep state and invalidate it every time the recovery master
1765 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1767 (*rebalance_candidates)[t->pnn] = false;
1771 /* 3rd step: if a node is forced to re-balance then
1772 we allow failback onto the node */
1773 if (ipalloc_state->force_rebalance_nodes == NULL) {
1777 i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
1779 uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
1780 if (pnn >= numnodes) {
1782 (__location__ "unknown node %u\n", pnn));
1787 ("Forcing rebalancing of IPs to node %u\n", pnn));
1788 (*rebalance_candidates)[pnn] = true;
1794 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1795 * the IP/node combination that will cost the least.
1797 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
1798 uint32_t *lcp2_imbalances)
1800 struct public_ip_list *t;
1801 int dstnode, numnodes;
1804 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1805 struct public_ip_list *minip;
1807 bool should_loop = true;
1808 bool have_unassigned = true;
1810 numnodes = ipalloc_state->num;
1812 while (have_unassigned && should_loop) {
1813 should_loop = false;
1815 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1816 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1822 /* loop over each unassigned ip. */
1823 for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
1828 for (dstnode = 0; dstnode < numnodes; dstnode++) {
1829 /* only check nodes that can actually takeover this ip */
1830 if (!can_node_takeover_ip(ipalloc_state,
1833 /* no it couldnt so skip to the next node */
1837 dstdsum = ip_distance_2_sum(&(t->addr),
1838 ipalloc_state->all_ips,
1840 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1842 (" %s -> %d [+%d]\n",
1843 ctdb_addr_to_str(&(t->addr)),
1845 dstimbl - lcp2_imbalances[dstnode]));
1848 if ((minnode == -1) || (dstdsum < mindsum)) {
1858 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1860 /* If we found one then assign it to the given node. */
1861 if (minnode != -1) {
1862 minip->pnn = minnode;
1863 lcp2_imbalances[minnode] = minimbl;
1864 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1865 ctdb_addr_to_str(&(minip->addr)),
1870 /* There might be a better way but at least this is clear. */
1871 have_unassigned = false;
1872 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1874 have_unassigned = true;
1879 /* We know if we have an unassigned addresses so we might as
1882 if (have_unassigned) {
1883 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1885 DEBUG(DEBUG_WARNING,
1886 ("Failed to find node to cover ip %s\n",
1887 ctdb_addr_to_str(&t->addr)));
1893 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1894 * to move IPs from, determines the best IP/destination node
1895 * combination to move from the source node.
1897 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
1899 uint32_t *lcp2_imbalances,
1900 bool *rebalance_candidates)
1902 int dstnode, mindstnode, numnodes;
1903 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1904 uint32_t minsrcimbl, mindstimbl;
1905 struct public_ip_list *minip;
1906 struct public_ip_list *t;
1908 /* Find an IP and destination node that best reduces imbalance. */
1915 numnodes = ipalloc_state->num;
1917 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1918 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1919 srcnode, lcp2_imbalances[srcnode]));
1921 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
1922 /* Only consider addresses on srcnode. */
1923 if (t->pnn != srcnode) {
1927 /* What is this IP address costing the source node? */
1928 srcdsum = ip_distance_2_sum(&(t->addr),
1929 ipalloc_state->all_ips,
1931 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1933 /* Consider this IP address would cost each potential
1934 * destination node. Destination nodes are limited to
1935 * those that are newly healthy, since we don't want
1936 * to do gratuitous failover of IPs just to make minor
1937 * balance improvements.
1939 for (dstnode = 0; dstnode < numnodes; dstnode++) {
1940 if (!rebalance_candidates[dstnode]) {
1944 /* only check nodes that can actually takeover this ip */
1945 if (!can_node_takeover_ip(ipalloc_state, dstnode,
1947 /* no it couldnt so skip to the next node */
1951 dstdsum = ip_distance_2_sum(&(t->addr),
1952 ipalloc_state->all_ips,
1954 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1955 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1957 ctdb_addr_to_str(&(t->addr)),
1960 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1961 (dstdsum < srcdsum) && \
1962 ((mindstnode == -1) || \
1963 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1966 minsrcimbl = srcimbl;
1967 mindstnode = dstnode;
1968 mindstimbl = dstimbl;
1972 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1974 if (mindstnode != -1) {
1975 /* We found a move that makes things better... */
1977 ("%d [%d] -> %s -> %d [+%d]\n",
1978 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1979 ctdb_addr_to_str(&(minip->addr)),
1980 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1983 lcp2_imbalances[srcnode] = minsrcimbl;
1984 lcp2_imbalances[mindstnode] = mindstimbl;
1985 minip->pnn = mindstnode;
/* A node number paired with that node's LCP2 imbalance, for sorting */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/*
 * qsort() comparator for struct lcp2_imbalance_pnn.
 *
 * Orders by descending imbalance: returns -1 when a has the larger
 * imbalance, 1 when it has the smaller one and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	if (lhs->imbalance == rhs->imbalance) {
		return 0;
	}

	return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
2013 /* LCP2 algorithm for rebalancing the cluster. This finds the source
2014 * node with the highest LCP2 imbalance, and then determines the best
2015 * IP/destination node combination to move from the source node.
2017 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
2018 uint32_t *lcp2_imbalances,
2019 bool *rebalance_candidates)
2022 struct lcp2_imbalance_pnn * lips;
2025 numnodes = ipalloc_state->num;
2028 /* Put the imbalances and nodes into an array, sort them and
2029 * iterate through candidates. Usually the 1st one will be
2030 * used, so this doesn't cost much...
2032 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2033 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2034 lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
2035 for (i = 0; i < numnodes; i++) {
2036 lips[i].imbalance = lcp2_imbalances[i];
2038 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2040 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2041 lcp2_cmp_imbalance_pnn);
2044 for (i = 0; i < numnodes; i++) {
2045 /* This means that all nodes had 0 or 1 addresses, so
2046 * can't be imbalanced.
2048 if (lips[i].imbalance == 0) {
2052 if (lcp2_failback_candidate(ipalloc_state,
2055 rebalance_candidates)) {
2067 static void unassign_unsuitable_ips(struct ipalloc_state *ipalloc_state)
2069 struct public_ip_list *t;
2071 /* verify that the assigned nodes can serve that public ip
2072 and set it to -1 if not
2074 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
2078 if (!can_node_host_ip(ipalloc_state, t->pnn, t) != 0) {
2079 /* this node can not serve this ip. */
2080 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2081 ctdb_addr_to_str(&(t->addr)),
2088 static bool ipalloc_deterministic(struct ipalloc_state *ipalloc_state)
2090 struct public_ip_list *t;
2093 numnodes = ipalloc_state->num;
2095 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2096 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2097 * always be allocated the same way for a specific set of
2098 * available/unavailable nodes.
2101 for (i = 0, t = ipalloc_state->all_ips; t!= NULL; t = t->next, i++) {
2102 t->pnn = i % numnodes;
2105 /* IP failback doesn't make sense with deterministic
2106 * IPs, since the modulo step above implicitly fails
2107 * back IPs to their "home" node.
2109 if (1 == ipalloc_state->no_ip_failback) {
2110 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2113 unassign_unsuitable_ips(ipalloc_state);
2115 basic_allocate_unassigned(ipalloc_state);
2117 /* No failback here! */
2122 static bool ipalloc_nondeterministic(struct ipalloc_state *ipalloc_state)
2124 /* This should be pushed down into basic_failback. */
2125 struct public_ip_list *t;
2127 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
2131 unassign_unsuitable_ips(ipalloc_state);
2133 basic_allocate_unassigned(ipalloc_state);
2135 /* If we don't want IPs to fail back then don't rebalance IPs. */
2136 if (1 == ipalloc_state->no_ip_failback) {
2140 /* Now, try to make sure the ip adresses are evenly distributed
2143 basic_failback(ipalloc_state, num_ips);
2148 static bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
2150 uint32_t *lcp2_imbalances;
2151 bool *rebalance_candidates;
2152 int numnodes, num_rebalance_candidates, i;
2155 unassign_unsuitable_ips(ipalloc_state);
2157 if (!lcp2_init(ipalloc_state,
2158 &lcp2_imbalances, &rebalance_candidates)) {
2163 lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
2165 /* If we don't want IPs to fail back then don't rebalance IPs. */
2166 if (1 == ipalloc_state->no_ip_failback) {
2170 /* It is only worth continuing if we have suitable target
2171 * nodes to transfer IPs to. This check is much cheaper than
2174 numnodes = ipalloc_state->num;
2175 num_rebalance_candidates = 0;
2176 for (i=0; i<numnodes; i++) {
2177 if (rebalance_candidates[i]) {
2178 num_rebalance_candidates++;
2181 if (num_rebalance_candidates == 0) {
2185 /* Now, try to make sure the ip adresses are evenly distributed
2188 lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
2194 static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
2198 for (i=0;i<nodemap->num;i++) {
2199 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2200 /* Found one completely healthy node */
2208 /* The calculation part of the IP allocation algorithm. */
2209 static bool ipalloc(struct ipalloc_state *ipalloc_state)
2213 switch (ipalloc_state->algorithm) {
2215 ret = ipalloc_lcp2(ipalloc_state);
2217 case IPALLOC_DETERMINISTIC:
2218 ret = ipalloc_deterministic(ipalloc_state);
2220 case IPALLOC_NONDETERMINISTIC:
2221 ret = ipalloc_nondeterministic(ipalloc_state);
2225 /* at this point ->pnn is the node which will own each IP
2226 or -1 if there is no node that can cover this ip
2232 struct get_tunable_callback_data {
2233 const char *tunable;
2238 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2239 int32_t res, TDB_DATA outdata,
2242 struct get_tunable_callback_data *cd =
2243 (struct get_tunable_callback_data *)callback;
2247 /* Already handled in fail callback */
2251 if (outdata.dsize != sizeof(uint32_t)) {
2252 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2253 cd->tunable, pnn, (int)sizeof(uint32_t),
2254 (int)outdata.dsize));
2259 size = talloc_array_length(cd->out);
2261 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2262 cd->tunable, pnn, size));
2267 cd->out[pnn] = *(uint32_t *)outdata.dptr;
2270 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2271 int32_t res, TDB_DATA outdata,
2274 struct get_tunable_callback_data *cd =
2275 (struct get_tunable_callback_data *)callback;
2280 ("Timed out getting tunable \"%s\" from node %d\n",
2286 DEBUG(DEBUG_WARNING,
2287 ("Tunable \"%s\" not implemented on node %d\n",
2292 ("Unexpected error getting tunable \"%s\" from node %d\n",
2298 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2299 TALLOC_CTX *tmp_ctx,
2300 struct ctdb_node_map_old *nodemap,
2301 const char *tunable,
2302 uint32_t default_value)
2305 struct ctdb_control_get_tunable *t;
2308 struct get_tunable_callback_data callback_data;
2311 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2312 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2313 for (i=0; i<nodemap->num; i++) {
2314 tvals[i] = default_value;
2317 callback_data.out = tvals;
2318 callback_data.tunable = tunable;
2319 callback_data.fatal = false;
2321 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2322 data.dptr = talloc_size(tmp_ctx, data.dsize);
2323 t = (struct ctdb_control_get_tunable *)data.dptr;
2324 t->length = strlen(tunable)+1;
2325 memcpy(t->name, tunable, t->length);
2326 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2327 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2328 nodes, 0, TAKEOVER_TIMEOUT(),
2330 get_tunable_callback,
2331 get_tunable_fail_callback,
2332 &callback_data) != 0) {
2333 if (callback_data.fatal) {
2339 talloc_free(data.dptr);
2344 /* Set internal flags for IP allocation:
2346 * Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2347 * Set NOIPHOST ip flag for each INACTIVE node
2348 * if all nodes are disabled:
2349 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2351 * Set NOIPHOST ip flags for disabled nodes
2353 static void set_ipflags_internal(struct ipalloc_state *ipalloc_state,
2354 struct ctdb_node_map_old *nodemap,
2355 uint32_t *tval_noiptakeover,
2356 uint32_t *tval_noiphostonalldisabled)
2360 for (i=0;i<nodemap->num;i++) {
2361 /* Can not take IPs on node with NoIPTakeover set */
2362 if (tval_noiptakeover[i] != 0) {
2363 ipalloc_state->noiptakeover[i] = true;
2366 /* Can not host IPs on INACTIVE node */
2367 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2368 ipalloc_state->noiphost[i] = true;
2372 if (all_nodes_are_disabled(nodemap)) {
2373 /* If all nodes are disabled, can not host IPs on node
2374 * with NoIPHostOnAllDisabled set
2376 for (i=0;i<nodemap->num;i++) {
2377 if (tval_noiphostonalldisabled[i] != 0) {
2378 ipalloc_state->noiphost[i] = true;
2382 /* If some nodes are not disabled, then can not host
2383 * IPs on DISABLED node
2385 for (i=0;i<nodemap->num;i++) {
2386 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2387 ipalloc_state->noiphost[i] = true;
2393 static bool set_ipflags(struct ctdb_context *ctdb,
2394 struct ipalloc_state *ipalloc_state,
2395 struct ctdb_node_map_old *nodemap)
2397 uint32_t *tval_noiptakeover;
2398 uint32_t *tval_noiphostonalldisabled;
2400 tval_noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
2402 if (tval_noiptakeover == NULL) {
2406 tval_noiphostonalldisabled =
2407 get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
2408 "NoIPHostOnAllDisabled", 0);
2409 if (tval_noiphostonalldisabled == NULL) {
2410 /* Caller frees tmp_ctx */
2414 set_ipflags_internal(ipalloc_state, nodemap,
2416 tval_noiphostonalldisabled);
2418 talloc_free(tval_noiptakeover);
2419 talloc_free(tval_noiphostonalldisabled);
2424 static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
2425 TALLOC_CTX *mem_ctx)
2427 struct ipalloc_state *ipalloc_state =
2428 talloc_zero(mem_ctx, struct ipalloc_state);
2429 if (ipalloc_state == NULL) {
2430 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2434 ipalloc_state->num = ctdb->num_nodes;
2435 ipalloc_state->known_public_ips =
2436 talloc_zero_array(ipalloc_state,
2437 struct ctdb_public_ip_list_old *,
2438 ipalloc_state->num);
2439 if (ipalloc_state->known_public_ips == NULL) {
2440 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2441 talloc_free(ipalloc_state);
2444 ipalloc_state->available_public_ips =
2445 talloc_zero_array(ipalloc_state,
2446 struct ctdb_public_ip_list_old *,
2447 ipalloc_state->num);
2448 if (ipalloc_state->available_public_ips == NULL) {
2449 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2450 talloc_free(ipalloc_state);
2453 ipalloc_state->noiptakeover =
2454 talloc_zero_array(ipalloc_state,
2456 ipalloc_state->num);
2457 if (ipalloc_state->noiptakeover == NULL) {
2458 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2459 talloc_free(ipalloc_state);
2462 ipalloc_state->noiphost =
2463 talloc_zero_array(ipalloc_state,
2465 ipalloc_state->num);
2466 if (ipalloc_state->noiphost == NULL) {
2467 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
2468 talloc_free(ipalloc_state);
2472 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2473 ipalloc_state->algorithm = IPALLOC_LCP2;
2474 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2475 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
2477 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
2480 ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
2482 return ipalloc_state;
2485 struct iprealloc_callback_data {
2488 client_async_callback fail_callback;
2489 void *fail_callback_data;
2490 struct ctdb_node_map_old *nodemap;
2493 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2494 int32_t res, TDB_DATA outdata,
2498 struct iprealloc_callback_data *cd =
2499 (struct iprealloc_callback_data *)callback;
2501 numnodes = talloc_array_length(cd->retry_nodes);
2502 if (pnn > numnodes) {
2504 ("ipreallocated failure from node %d, "
2505 "but only %d nodes in nodemap\n",
2510 /* Can't run the "ipreallocated" event on a INACTIVE node */
2511 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2512 DEBUG(DEBUG_WARNING,
2513 ("ipreallocated failed on inactive node %d, ignoring\n",
2520 /* If the control timed out then that's a real error,
2521 * so call the real fail callback
2523 if (cd->fail_callback) {
2524 cd->fail_callback(ctdb, pnn, res, outdata,
2525 cd->fail_callback_data);
2527 DEBUG(DEBUG_WARNING,
2528 ("iprealloc timed out but no callback registered\n"));
2532 /* If not a timeout then either the ipreallocated
2533 * eventscript (or some setup) failed. This might
2534 * have failed because the IPREALLOCATED control isn't
2535 * implemented - right now there is no way of knowing
2536 * because the error codes are all folded down to -1.
2537 * Consider retrying using EVENTSCRIPT control...
2539 DEBUG(DEBUG_WARNING,
2540 ("ipreallocated failure from node %d, flagging retry\n",
2542 cd->retry_nodes[pnn] = true;
2547 struct takeover_callback_data {
2549 client_async_callback fail_callback;
2550 void *fail_callback_data;
2551 struct ctdb_node_map_old *nodemap;
2554 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2555 uint32_t node_pnn, int32_t res,
2556 TDB_DATA outdata, void *callback_data)
2558 struct takeover_callback_data *cd =
2559 talloc_get_type_abort(callback_data,
2560 struct takeover_callback_data);
2563 for (i = 0; i < cd->nodemap->num; i++) {
2564 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2569 if (i == cd->nodemap->num) {
2570 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2574 if (!cd->node_failed[i]) {
2575 cd->node_failed[i] = true;
2576 cd->fail_callback(ctdb, node_pnn, res, outdata,
2577 cd->fail_callback_data);
2582 make any IP alias changes for public addresses that are necessary
2584 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
2585 uint32_t *force_rebalance_nodes,
2586 client_async_callback fail_callback, void *callback_data)
2589 struct ctdb_public_ip ip;
2591 struct public_ip_list *all_ips, *tmp_ip;
2593 struct timeval timeout;
2594 struct client_async_data *async_data;
2595 struct ctdb_client_control_state *state;
2596 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2597 struct ipalloc_state *ipalloc_state;
2598 struct takeover_callback_data *takeover_data;
2599 struct iprealloc_callback_data iprealloc_data;
2604 * ip failover is completely disabled, just send out the
2605 * ipreallocated event.
2607 if (ctdb->tunable.disable_ip_failover != 0) {
2611 ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
2612 if (ipalloc_state == NULL) {
2613 talloc_free(tmp_ctx);
2617 if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
2618 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2619 talloc_free(tmp_ctx);
2623 /* Fetch known/available public IPs from each active node */
2624 ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
2626 talloc_free(tmp_ctx);
2630 /* Short-circuit IP allocation if no node has available IPs */
2631 can_host_ips = false;
2632 for (i=0; i < ipalloc_state->num; i++) {
2633 if (ipalloc_state->available_public_ips[i] != NULL) {
2634 can_host_ips = true;
2637 if (!can_host_ips) {
2638 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2642 /* since nodes only know about those public addresses that
2643 can be served by that particular node, no single node has
2644 a full list of all public addresses that exist in the cluster.
2645 Walk over all node structures and create a merged list of
2646 all public addresses that exist in the cluster.
2648 keep the tree of ips around as ctdb->ip_tree
2650 all_ips = create_merged_ip_list(ctdb, ipalloc_state);
2651 ipalloc_state->all_ips = all_ips;
2653 ipalloc_state->force_rebalance_nodes = force_rebalance_nodes;
2655 /* Do the IP reassignment calculations */
2656 ipalloc(ipalloc_state);
2658 /* Now tell all nodes to release any public IPs should not
2659 * host. This will be a NOOP on nodes that don't currently
2660 * hold the given IP.
2662 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2663 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2665 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2666 bool, nodemap->num);
2667 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2668 takeover_data->fail_callback = fail_callback;
2669 takeover_data->fail_callback_data = callback_data;
2670 takeover_data->nodemap = nodemap;
2672 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2673 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2675 async_data->fail_callback = takeover_run_fail_callback;
2676 async_data->callback_data = takeover_data;
2678 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2680 /* Send a RELEASE_IP to all nodes that should not be hosting
2681 * each IP. For each IP, all but one of these will be
2682 * redundant. However, the redundant ones are used to tell
2683 * nodes which node should be hosting the IP so that commands
2684 * like "ctdb ip" can display a particular nodes idea of who
2685 * is hosting what. */
2686 for (i=0;i<nodemap->num;i++) {
2687 /* don't talk to unconnected nodes, but do talk to banned nodes */
2688 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2692 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2693 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2694 /* This node should be serving this
2695 vnn so don't tell it to release the ip
2699 ip.pnn = tmp_ip->pnn;
2700 ip.addr = tmp_ip->addr;
2702 timeout = TAKEOVER_TIMEOUT();
2703 data.dsize = sizeof(ip);
2704 data.dptr = (uint8_t *)&ip;
2705 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2706 0, CTDB_CONTROL_RELEASE_IP, 0,
2709 if (state == NULL) {
2710 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2711 talloc_free(tmp_ctx);
2715 ctdb_client_async_add(async_data, state);
2718 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2719 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2720 talloc_free(tmp_ctx);
2723 talloc_free(async_data);
2726 /* For each IP, send a TAKOVER_IP to the node that should be
2727 * hosting it. Many of these will often be redundant (since
2728 * the allocation won't have changed) but they can be useful
2729 * to recover from inconsistencies. */
2730 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2731 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2733 async_data->fail_callback = fail_callback;
2734 async_data->callback_data = callback_data;
2736 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2737 if (tmp_ip->pnn == -1) {
2738 /* this IP won't be taken over */
2742 ip.pnn = tmp_ip->pnn;
2743 ip.addr = tmp_ip->addr;
2745 timeout = TAKEOVER_TIMEOUT();
2746 data.dsize = sizeof(ip);
2747 data.dptr = (uint8_t *)&ip;
2748 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2749 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2750 data, async_data, &timeout, NULL);
2751 if (state == NULL) {
2752 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2753 talloc_free(tmp_ctx);
2757 ctdb_client_async_add(async_data, state);
2759 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2760 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2761 talloc_free(tmp_ctx);
2767 * Tell all nodes to run eventscripts to process the
2768 * "ipreallocated" event. This can do a lot of things,
2769 * including restarting services to reconfigure them if public
2770 * IPs have moved. Once upon a time this event only used to
2773 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2774 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2775 iprealloc_data.retry_nodes = retry_data;
2776 iprealloc_data.retry_count = 0;
2777 iprealloc_data.fail_callback = fail_callback;
2778 iprealloc_data.fail_callback_data = callback_data;
2779 iprealloc_data.nodemap = nodemap;
2781 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2782 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2783 nodes, 0, TAKEOVER_TIMEOUT(),
2785 NULL, iprealloc_fail_callback,
2788 /* If the control failed then we should retry to any
2789 * nodes flagged by iprealloc_fail_callback using the
2790 * EVENTSCRIPT control. This is a best-effort at
2791 * backward compatiblity when running a mixed cluster
2792 * where some nodes have not yet been upgraded to
2793 * support the IPREALLOCATED control.
2795 DEBUG(DEBUG_WARNING,
2796 ("Retry ipreallocated to some nodes using eventscript control\n"));
2798 nodes = talloc_array(tmp_ctx, uint32_t,
2799 iprealloc_data.retry_count);
2800 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2803 for (i=0; i<nodemap->num; i++) {
2804 if (iprealloc_data.retry_nodes[i]) {
2810 data.dptr = discard_const("ipreallocated");
2811 data.dsize = strlen((char *)data.dptr) + 1;
2812 ret = ctdb_client_async_control(ctdb,
2813 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2814 nodes, 0, TAKEOVER_TIMEOUT(),
2816 NULL, fail_callback,
2819 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2823 talloc_free(tmp_ctx);
/*
 * Talloc destructor for a struct ctdb_client_ip (installed with
 * talloc_set_destructor() in ctdb_control_tcp_client()).  Emits a debug
 * message naming the entry's address and port, then unlinks the entry
 * from ip->ctdb->client_ip_list.
 * NOTE(review): the port is read through addr.ip.sin_port whatever the
 * address family is -- confirm this is intended for IPv6 entries.
 */
2829 destroy a ctdb_client_ip structure
2831 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2833 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2834 ctdb_addr_to_str(&ip->addr),
2835 ntohs(ip->addr.ip.sin_port),
2838 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2843 called by a client to inform us of a TCP connection that it is managing
2844 and that should be tickled with an ACK when IP takeover is done
/*
 * Control handler: a local client registers a TCP connection (a
 * struct ctdb_connection carried in indata.dptr).
 *
 * Visible steps: both endpoints are canonicalized, the destination is
 * looked up as a public address, and the request is rejected when that
 * address is held by a different node.  Otherwise a ctdb_client_ip and
 * a ctdb_tcp_list entry are allocated under the client, linked into
 * ctdb->client_ip_list and client->tcp_list, and the connection is
 * broadcast to all connected nodes as CTDB_CONTROL_TCP_ADD with
 * CTDB_CTRL_FLAG_NOREPLY.
 *
 * NOTE(review): `client` is the result of reqid_find() and is used as a
 * talloc parent and dereferenced (client->pid) with no NULL check in
 * the lines shown -- confirm the lookup cannot fail here.
 */
2846 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2849 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2850 struct ctdb_connection *tcp_sock = NULL;
2851 struct ctdb_tcp_list *tcp;
2852 struct ctdb_connection t;
2855 struct ctdb_client_ip *ip;
2856 struct ctdb_vnn *vnn;
2857 ctdb_sock_addr addr;
2859 /* If we don't have public IPs, tickles are useless */
2860 if (ctdb->vnn == NULL) {
2864 tcp_sock = (struct ctdb_connection *)indata.dptr;
/* addr is a scratch copy; the canonical form is written back into the
 * connection record itself (presumably argument order is in, out) */
2866 addr = tcp_sock->src;
2867 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2868 addr = tcp_sock->dst;
2869 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
/* from here on addr holds the (canonical) destination address */
2872 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2873 vnn = find_public_ip_vnn(ctdb, &addr);
/* the messages below report a destination that is not a public
 * address; IPv4 loopback is exempt from the IPv4 message */
2875 switch (addr.sa.sa_family) {
2877 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2878 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2879 ctdb_addr_to_str(&addr)));
2883 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2884 ctdb_addr_to_str(&addr)));
2887 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2893 if (vnn->pnn != ctdb->pnn) {
2894 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2895 ctdb_addr_to_str(&addr),
2896 client_id, client->pid));
2897 /* failing this call will tell smbd to die */
/* per-client record of the address; the destructor unlinks it from
 * ctdb->client_ip_list when it is freed */
2901 ip = talloc(client, struct ctdb_client_ip);
2902 CTDB_NO_MEMORY(ctdb, ip);
2906 ip->client_id = client_id;
2907 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2908 DLIST_ADD(ctdb->client_ip_list, ip);
/* per-client record of the connection itself */
2910 tcp = talloc(client, struct ctdb_tcp_list);
2911 CTDB_NO_MEMORY(ctdb, tcp);
2913 tcp->connection.src = tcp_sock->src;
2914 tcp->connection.dst = tcp_sock->dst;
2916 DLIST_ADD(client->tcp_list, tcp);
/* t is the payload of the TCP_ADD broadcast sent below */
2918 t.src = tcp_sock->src;
2919 t.dst = tcp_sock->dst;
2921 data.dptr = (uint8_t *)&t;
2922 data.dsize = sizeof(t);
2924 switch (addr.sa.sa_family) {
2926 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2927 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2928 ctdb_addr_to_str(&tcp_sock->src),
2929 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2932 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2933 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2934 ctdb_addr_to_str(&tcp_sock->src),
2935 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2938 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2942 /* tell all nodes about this tcp connection */
2943 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2944 CTDB_CONTROL_TCP_ADD,
2945 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2947 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
/*
 * Linear search of `array` for a connection whose source AND
 * destination both match `tcp` (compared with ctdb_same_sockaddr()).
 * On a match, returns a pointer to the element inside
 * array->connections (not a copy).  A NULL `array` is tested for
 * explicitly; callers in this file compare the result against NULL to
 * detect "not found".
 */
2955 find a tcp address on a list
2957 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2958 struct ctdb_connection *tcp)
2962 if (array == NULL) {
2966 for (i=0;i<array->num;i++) {
2967 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2968 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2969 return &array->connections[i];
2978 called by a daemon to inform us of a TCP connection that one of its
2979 clients is managing and that should be tickled with an ACK when IP takeover is
/*
 * Control handler: record the connection in indata.dptr (a
 * struct ctdb_connection) in the tickle array of the public address
 * that matches its destination.
 *
 * - First connection for the address: a ctdb_tcp_array is allocated
 *   under the vnn with room for one entry.
 * - Otherwise the array is searched with ctdb_tcp_find(); a duplicate
 *   is only logged, a new connection is appended after growing the
 *   array with talloc_realloc().
 * When tcp_update_needed is true, vnn->tcp_update_needed is set on the
 * paths that add an entry.
 *
 * NOTE(review): the talloc_realloc() result is assigned straight to
 * tcparray->connections, so on allocation failure the only pointer to
 * the existing entries is replaced by NULL while the array's count is
 * untouched -- consider using a temporary.
 */
2982 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2984 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2985 struct ctdb_tcp_array *tcparray;
2986 struct ctdb_connection tcp;
2987 struct ctdb_vnn *vnn;
2989 /* If we don't have public IPs, tickles are useless */
2990 if (ctdb->vnn == NULL) {
2994 vnn = find_public_ip_vnn(ctdb, &p->dst);
2996 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2997 ctdb_addr_to_str(&p->dst)));
3003 tcparray = vnn->tcp_array;
3005 /* If this is the first tickle */
3006 if (tcparray == NULL) {
3007 tcparray = talloc(vnn, struct ctdb_tcp_array);
3008 CTDB_NO_MEMORY(ctdb, tcparray);
3009 vnn->tcp_array = tcparray;
3012 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
3013 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3015 tcparray->connections[tcparray->num].src = p->src;
3016 tcparray->connections[tcparray->num].dst = p->dst;
3019 if (tcp_update_needed) {
3020 vnn->tcp_update_needed = true;
3026 /* Do we already have this tickle ?*/
/* presumably `tcp` was filled from p on lines not shown -- TODO confirm */
3029 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3030 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3031 ctdb_addr_to_str(&tcp.dst),
3032 ntohs(tcp.dst.ip.sin_port),
3037 /* A new tickle, we must add it to the array */
3038 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3039 struct ctdb_connection,
3041 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3043 tcparray->connections[tcparray->num].src = p->src;
3044 tcparray->connections[tcparray->num].dst = p->dst;
3047 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3048 ctdb_addr_to_str(&tcp.dst),
3049 ntohs(tcp.dst.ip.sin_port),
3052 if (tcp_update_needed) {
3053 vnn->tcp_update_needed = true;
/*
 * Remove `conn` from the tickle array of `vnn`.
 *
 * A missing array or an unknown connection is only logged at
 * DEBUG_INFO.  A known connection is removed by overwriting its slot
 * with the last element and decrementing the count, so the order of
 * the remaining entries is not preserved.  When the count reaches zero
 * the whole array is freed and vnn->tcp_array reset to NULL.  After a
 * removal vnn->tcp_update_needed is set.
 */
3060 static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
3062 struct ctdb_connection *tcpp;
3068 /* if the array is empty we cant remove it
3069 and we don't need to do anything
3071 if (vnn->tcp_array == NULL) {
3072 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3073 ctdb_addr_to_str(&conn->dst),
3074 ntohs(conn->dst.ip.sin_port)));
3079 /* See if we know this connection
3080 if we don't know this connection then we dont need to do anything
3082 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3084 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3085 ctdb_addr_to_str(&conn->dst),
3086 ntohs(conn->dst.ip.sin_port)));
3091 /* We need to remove this entry from the array.
3092 Instead of allocating a new array and copying data to it
3093 we cheat and just copy the last entry in the existing array
3094 to the entry that is to be removed and just shring the
/* tcpp points into the array, so this assignment overwrites the slot
 * being removed with the current last element */
3097 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3098 vnn->tcp_array->num--;
3100 /* If we deleted the last entry we also need to remove the entire array
3102 if (vnn->tcp_array->num == 0) {
3103 talloc_free(vnn->tcp_array);
3104 vnn->tcp_array = NULL;
3107 vnn->tcp_update_needed = true;
3109 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3110 ctdb_addr_to_str(&conn->src),
3111 ntohs(conn->src.ip.sin_port)));
3116 called by a daemon to inform us that a TCP connection that one of its
3117 clients used is no longer needed in the tickle database
/*
 * Control handler: drop the connection in indata.dptr (a
 * struct ctdb_connection) from the tickle array of the public address
 * matching its destination, via ctdb_remove_connection().  An address
 * that is not a known public address produces the "unable to find
 * public address" message instead.
 */
3119 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3121 struct ctdb_vnn *vnn;
3122 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
3124 /* If we don't have public IPs, tickles are useless */
3125 if (ctdb->vnn == NULL) {
3129 vnn = find_public_ip_vnn(ctdb, &conn->dst);
3132 (__location__ " unable to find public address %s\n",
3133 ctdb_addr_to_str(&conn->dst)));
3137 ctdb_remove_connection(vnn, conn);
3144 Called when another daemon starts - causes all tickles for all
3145 public addresses we are serving to be sent to the new node on the
3146 next check. This actually causes the next scheduled call to
3147 ctdb_update_tcp_tickles() to update all nodes. This is simple and
3148 doesn't require careful error handling.
/*
 * Control handler for a "startup" notification from node `pnn`.
 * Logs the sender and sets tcp_update_needed on every vnn in
 * ctdb->vnn; ctdb_update_tcp_tickles() only sends tickle lists for
 * vnns that have this flag set.
 */
3150 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3152 struct ctdb_vnn *vnn;
3154 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3155 (unsigned long) pnn));
3157 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3158 vnn->tcp_update_needed = true;
/*
 * Drains client->tcp_list.  Each entry is unlinked from the list and
 * its destination is looked up as a public address.  The connection is
 * removed from the vnn's tickle array only when this node currently
 * hosts that address (vnn->pnn == client->ctdb->pnn); otherwise the
 * tickle information is deliberately left alone, as the trailing
 * comment explains.
 */
3166 called when a client structure goes away - hook to remove
3167 elements from the tcp_list in all daemons
3169 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3171 while (client->tcp_list) {
3172 struct ctdb_vnn *vnn;
3173 struct ctdb_tcp_list *tcp = client->tcp_list;
3174 struct ctdb_connection *conn = &tcp->connection;
/* unlinking the head entry is what advances the while loop */
3176 DLIST_REMOVE(client->tcp_list, tcp);
3178 vnn = find_public_ip_vnn(client->ctdb,
3182 (__location__ " unable to find public address %s\n",
3183 ctdb_addr_to_str(&conn->dst)));
3187 /* If the IP address is hosted on this node then
3188 * remove the connection. */
3189 if (vnn->pnn == client->ctdb->pnn) {
3190 ctdb_remove_connection(vnn, conn);
3193 /* Otherwise this function has been called because the
3194 * server IP address has been released to another node
3195 * and the client has exited. This means that we
3196 * should not delete the connection information. The
3197 * takeover node processes connections too. */
/*
 * Release every public IP address.
 *
 * The function first tests the disable_ip_failover tunable.  It then
 * walks ctdb->vnn: for an address the system does not currently have
 * (ctdb_sys_have_ip() false) the interface assignment is dropped;
 * an address with update_in_flight already set is only warned about.
 * For the remaining addresses update_in_flight is raised, the
 * CTDB_EVENT_RELEASE_IP event script is invoked with
 * "<iface> <address> <netmask bits>", clients of the address are
 * killed, the interface is unassigned and the flag is cleared again.
 * A count of released addresses is logged at the end.
 */
3202 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3204 struct ctdb_vnn *vnn;
3207 if (ctdb->tunable.disable_ip_failover == 1) {
3211 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3212 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3213 ctdb_vnn_unassign_iface(ctdb, vnn);
3220 /* Don't allow multiple releases at once. Some code,
3221 * particularly ctdb_tickle_sentenced_connections() is
3223 if (vnn->update_in_flight) {
3224 DEBUG(DEBUG_WARNING,
3226 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3227 ctdb_addr_to_str(&vnn->public_address),
3228 vnn->public_netmask_bits,
3229 ctdb_vnn_iface_string(vnn)));
3232 vnn->update_in_flight = true;
3234 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3235 ctdb_addr_to_str(&vnn->public_address),
3236 vnn->public_netmask_bits,
3237 ctdb_vnn_iface_string(vnn)));
3239 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3240 ctdb_vnn_iface_string(vnn),
3241 ctdb_addr_to_str(&vnn->public_address),
3242 vnn->public_netmask_bits);
3243 release_kill_clients(ctdb, &vnn->public_address);
3244 ctdb_vnn_unassign_iface(ctdb, vnn);
3245 vnn->update_in_flight = false;
3249 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
/*
 * Control handler: return the list of public addresses in *outdata as
 * a struct ctdb_public_ip_list_old allocated under `outdata`.
 *
 * The buffer is sized from a first pass over ctdb->vnn and zero
 * filled.  The second pass copies pnn and address for each vnn,
 * skipping vnns for which ctdb_vnn_available() is false when the
 * request carries CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE.  The reported
 * dsize is recomputed from the number of entries actually written, so
 * it can be smaller than the allocation.
 */
3254 get list of public IPs
3256 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3257 struct ctdb_req_control_old *c, TDB_DATA *outdata)
3260 struct ctdb_public_ip_list_old *ips;
3261 struct ctdb_vnn *vnn;
3262 bool only_available = false;
3264 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3265 only_available = true;
3268 /* count how many public ip structures we have */
3270 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3274 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3275 num*sizeof(struct ctdb_public_ip);
3276 ips = talloc_zero_size(outdata, len);
3277 CTDB_NO_MEMORY(ctdb, ips);
3280 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3281 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3284 ips->ips[i].pnn = vnn->pnn;
3285 ips->ips[i].addr = vnn->public_address;
/* shrink the reported length to the entries actually filled in */
3289 len = offsetof(struct ctdb_public_ip_list_old, ips) +
3290 i*sizeof(struct ctdb_public_ip);
3292 outdata->dsize = len;
3293 outdata->dptr = (uint8_t *)ips;
/*
 * Control handler: describe one public address (indata.dptr is a
 * ctdb_sock_addr) as a struct ctdb_public_ip_info_old allocated under
 * `outdata`.
 *
 * The address is resolved with find_public_ip_vnn(), with
 * ctdb->single_ip_vnn as a fallback candidate.  The reply carries the
 * address, its pnn and one ctdb_iface record (name, link state,
 * reference count) per entry of the NULL-terminated vnn->ifaces list.
 * active_idx starts as 0xFFFFFFFF and is set to the index of the
 * interface equal to vnn->iface, if any.
 *
 * The name copy is bounded to sizeof(name)-1 and the buffer comes from
 * talloc_zero_size(), so the copied name stays NUL-terminated.
 */
3299 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3300 struct ctdb_req_control_old *c,
3305 ctdb_sock_addr *addr;
3306 struct ctdb_public_ip_info_old *info;
3307 struct ctdb_vnn *vnn;
3309 addr = (ctdb_sock_addr *)indata.dptr;
3311 vnn = find_public_ip_vnn(ctdb, addr);
3313 /* if it is not a public ip it could be our 'single ip' */
3314 if (ctdb->single_ip_vnn) {
3315 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3316 vnn = ctdb->single_ip_vnn;
3321 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3322 "'%s'not a public address\n",
3323 ctdb_addr_to_str(addr)));
3327 /* count how many public ip structures we have */
3329 for (;vnn->ifaces[num];) {
3333 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3334 num*sizeof(struct ctdb_iface);
3335 info = talloc_zero_size(outdata, len);
3336 CTDB_NO_MEMORY(ctdb, info);
3338 info->ip.addr = vnn->public_address;
3339 info->ip.pnn = vnn->pnn;
/* 0xFFFFFFFF is the initial value; it is replaced below only when one
 * of the listed interfaces is the vnn's current interface */
3340 info->active_idx = 0xFFFFFFFF;
3342 for (i=0; vnn->ifaces[i]; i++) {
3343 struct ctdb_interface *cur;
3345 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3347 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3351 if (vnn->iface == cur) {
3352 info->active_idx = i;
3354 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3355 info->ifaces[i].link_state = cur->link_up;
3356 info->ifaces[i].references = cur->references;
3359 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
3360 i*sizeof(struct ctdb_iface);
3362 outdata->dsize = len;
3363 outdata->dptr = (uint8_t *)info;
/*
 * Control handler: return every interface in ctdb->ifaces as a
 * struct ctdb_iface_list_old allocated (zero filled) under `outdata`.
 * Each record carries the interface name, link state and reference
 * count; dsize is computed from the number of records written.
 *
 * NOTE(review): the name is copied with an unbounded strcpy(), while
 * ctdb_control_get_public_ip_info() copies the same field with
 * strncpy(..., sizeof(name)-1).  Unless cur->name is guaranteed to fit
 * the destination, this should use the same bounded copy.
 */
3368 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3369 struct ctdb_req_control_old *c,
3373 struct ctdb_iface_list_old *ifaces;
3374 struct ctdb_interface *cur;
3376 /* count how many public ip structures we have */
3378 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3382 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3383 num*sizeof(struct ctdb_iface);
3384 ifaces = talloc_zero_size(outdata, len);
3385 CTDB_NO_MEMORY(ctdb, ifaces);
3388 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3389 strcpy(ifaces->ifaces[i].name, cur->name);
3390 ifaces->ifaces[i].link_state = cur->link_up;
3391 ifaces->ifaces[i].references = cur->references;
3395 len = offsetof(struct ctdb_iface_list_old, ifaces) +
3396 i*sizeof(struct ctdb_iface);
3398 outdata->dsize = len;
3399 outdata->dptr = (uint8_t *)ifaces;
/*
 * Control handler: set the link state of the interface named in
 * indata.dptr (a struct ctdb_iface).
 *
 * Validation visible here: the byte at name[CTDB_IFACE_SIZE] must be
 * NUL, link_state must be accepted by the switch, and references must
 * be 0.  The interface is then looked up by name; the new state is
 * compared with iface->link_up, and a change is logged (at DEBUG_ERR
 * when the link was up before, DEBUG_NOTICE otherwise) and stored in
 * iface->link_up.
 */
3404 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3405 struct ctdb_req_control_old *c,
3408 struct ctdb_iface *info;
3409 struct ctdb_interface *iface;
3410 bool link_up = false;
3412 info = (struct ctdb_iface *)indata.dptr;
3414 if (info->name[CTDB_IFACE_SIZE] != '\0') {
/* strnlen bounds the %*.*s print of the unterminated name */
3415 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3416 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3417 len, len, info->name));
3421 switch (info->link_state) {
3429 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3430 (unsigned int)info->link_state));
3434 if (info->references != 0) {
3435 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3436 (unsigned int)info->references));
3440 iface = ctdb_find_iface(ctdb, info->name);
3441 if (iface == NULL) {
3445 if (link_up == iface->link_up) {
3449 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3450 ("iface[%s] has changed it's link status %s => %s\n",
3452 iface->link_up?"up":"down",
3453 link_up?"up":"down"));
3455 iface->link_up = link_up;
/*
 * Per-vnn state for killing TCP connections.  Allocated under the vnn
 * by ctdb_killtcp_add_connection() and stored in vnn->killtcp.
 * Other members used by the code in this file (capture_fd,
 * private_data) are declared on lines not shown here.
 */
3461 structure containing the listening socket and the list of tcp connections
3462 that the ctdb daemon is to kill
3464 struct ctdb_kill_tcp {
/* owning vnn and daemon context */
3465 struct ctdb_vnn *vnn;
3466 struct ctdb_context *ctdb;
/* tevent fd event delivering captured packets to capture_tcp_handler() */
3468 struct tevent_fd *fde;
/* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
3469 trbt_tree_t *connections;
/*
 * One entry in ctdb_kill_tcp.connections.  A `count` member, compared
 * against 5 in tickle_connection_traverse(), is declared on a line not
 * shown here.
 */
3474 a tcp connection that is to be killed
3476 struct ctdb_killtcp_con {
/* the two endpoints of the connection */
3477 ctdb_sock_addr src_addr;
3478 ctdb_sock_addr dst_addr;
/* back-pointer to the owning kill state */
3480 struct ctdb_kill_tcp *killtcp;
/*
 * Build the KILLTCP_KEYLEN-word tree key for a (src, dst) socket pair.
 *
 * The key lives in a function-local static array that is zeroed and
 * refilled on every call, so each call overwrites the previous key and
 * the returned pointer must be consumed before the next call.
 * IPv4 uses words 0-3 (dst addr, src addr, dst port, src port); IPv6
 * uses all ten words, interleaving the 32-bit quarters of the two
 * addresses from last to first, followed by the two ports.  Values are
 * stored as found in the sockaddr, without byte-order conversion.
 * Mismatched or unknown address families are logged as errors.
 *
 * NOTE(review): the IPv6 branch reads s6_addr through a uint32_t
 * pointer cast; confirm alignment/aliasing is acceptable on all
 * supported platforms.
 */
3483 /* this function is used to create a key to represent this socketpair
3484 in the killtcp tree.
3485 this key is used to insert and lookup matching socketpairs that are
3486 to be tickled and RST
3488 #define KILLTCP_KEYLEN 10
3489 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3491 static uint32_t key[KILLTCP_KEYLEN];
3493 bzero(key, sizeof(key));
3495 if (src->sa.sa_family != dst->sa.sa_family) {
3496 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3500 switch (src->sa.sa_family) {
3502 key[0] = dst->ip.sin_addr.s_addr;
3503 key[1] = src->ip.sin_addr.s_addr;
3504 key[2] = dst->ip.sin_port;
3505 key[3] = src->ip.sin_port;
3508 uint32_t *dst6_addr32 =
3509 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3510 uint32_t *src6_addr32 =
3511 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3512 key[0] = dst6_addr32[3];
3513 key[1] = src6_addr32[3];
3514 key[2] = dst6_addr32[2];
3515 key[3] = src6_addr32[2];
3516 key[4] = dst6_addr32[1];
3517 key[5] = src6_addr32[1];
3518 key[6] = dst6_addr32[0];
3519 key[7] = src6_addr32[0];
3520 key[8] = dst->ip6.sin6_port;
3521 key[9] = src->ip6.sin6_port;
3525 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
/*
 * tevent fd handler for the capture socket of a struct ctdb_kill_tcp
 * (passed as private_data).
 *
 * Only TEVENT_FD_READ events are acted on.  A packet is parsed with
 * ctdb_sys_read_tcp_packet(); its (src, dst) pair is looked up in
 * killtcp->connections, and packets for unknown pairs are ignored.
 * For a known connection a TCP segment is sent from the stored
 * dst_addr to the stored src_addr using the captured sequence numbers,
 * with a final argument of 1 (per the surrounding comments this is the
 * reset that kills the connection).
 *
 * The lookup key is built as killtcp_key(&src, &dst) while the insert
 * in ctdb_killtcp_add_connection() uses (dst_addr, src_addr): the
 * captured packet is expected to have the endpoints swapped relative
 * to the stored connection.
 */
3533 called when we get a read event on the raw socket
3535 static void capture_tcp_handler(struct tevent_context *ev,
3536 struct tevent_fd *fde,
3537 uint16_t flags, void *private_data)
3539 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3540 struct ctdb_killtcp_con *con;
3541 ctdb_sock_addr src, dst;
3542 uint32_t ack_seq, seq;
3544 if (!(flags & TEVENT_FD_READ)) {
3548 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3549 killtcp->private_data,
3551 &ack_seq, &seq) != 0) {
3552 /* probably a non-tcp ACK packet */
3556 /* check if we have this guy in our list of connections
3559 con = trbt_lookuparray32(killtcp->connections,
3560 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3562 /* no this was some other packet we can just ignore */
3566 /* This one has been tickled !
3567 now reset him and remove him from the list.
3569 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3570 ntohs(con->dst_addr.ip.sin_port),
3571 ctdb_addr_to_str(&con->src_addr),
3572 ntohs(con->src_addr.ip.sin_port)));
3574 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
/*
 * Tree-traverse callback, invoked once per struct ctdb_killtcp_con.
 * `param` is a talloc context supplied by the caller.  A connection
 * whose count has reached 5 is re-parented onto `param` rather than
 * freed, because entries cannot be deleted while the tree is being
 * traversed; the caller frees `param` afterwards.  Any other
 * connection is tickled again from dst_addr to src_addr.
 */
3579 /* when traversing the list of all tcp connections to send tickle acks to
3580 (so that we can capture the ack coming back and kill the connection
3582 this callback is called for each connection we are currently trying to kill
3584 static int tickle_connection_traverse(void *param, void *data)
3586 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3588 /* have tried too many times, just give up */
3589 if (con->count >= 5) {
3590 /* can't delete in traverse: reparent to delete_cons */
3591 talloc_steal(param, con);
3595 /* othervise, try tickling it again */
3598 (ctdb_sock_addr *)&con->dst_addr,
3599 (ctdb_sock_addr *)&con->src_addr,
/*
 * Timer handler for a struct ctdb_kill_tcp (passed as private_data).
 *
 * Runs tickle_connection_traverse() over every tracked connection,
 * collecting expired ones under a temporary talloc context that is
 * freed once the traverse has finished.  If the connection tree is
 * then missing or empty, the whole killtcp structure is freed;
 * otherwise the timer is re-armed one second ahead.
 *
 * NOTE(review): the talloc_new(NULL) result is not checked in the
 * lines shown; if it were NULL, expired connections would be
 * re-parented to the NULL context rather than freed.
 */
3606 called every second until all sentenced connections have been reset
3608 static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
3609 struct tevent_timer *te,
3610 struct timeval t, void *private_data)
3612 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3613 void *delete_cons = talloc_new(NULL);
3615 /* loop over all connections sending tickle ACKs */
3616 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3618 /* now we've finished traverse, it's safe to do deletion. */
3619 talloc_free(delete_cons);
3621 /* If there are no more connections to kill we can remove the
3622 entire killtcp structure
3624 if ( (killtcp->connections == NULL) ||
3625 (killtcp->connections->root == NULL) ) {
3626 talloc_free(killtcp);
3630 /* try tickling them again in a seconds time
3632 tevent_add_timer(killtcp->ctdb->ev, killtcp,
3633 timeval_current_ofs(1, 0),
3634 ctdb_tickle_sentenced_connections, killtcp);
/*
 * Talloc destructor for struct ctdb_kill_tcp.  Clears the owning vnn's
 * killtcp pointer, after two visible checks: the vnn recorded in the
 * structure is searched for in ctdb->vnn (so a vnn that has gone away
 * is never dereferenced), and vnn->killtcp is compared with this
 * structure before it is reset to NULL.
 */
3638 destroy the killtcp structure
3640 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3642 struct ctdb_vnn *tmpvnn;
3644 /* verify that this vnn is still active */
3645 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3646 if (tmpvnn == killtcp->vnn) {
/* tmpvnn is NULL only when the loop ran off the end of the list */
3651 if (tmpvnn == NULL) {
3655 if (killtcp->vnn->killtcp != killtcp) {
3659 killtcp->vnn->killtcp = NULL;
/*
 * Insert callback passed to trbt_insertarray32_callback() by
 * ctdb_killtcp_add_connection(), which supplies the new connection as
 * `parm`.  The body is on lines not shown here; the comment below
 * describes the intended behaviour (replace any existing entry).
 */
3665 /* nothing fancy here, just unconditionally replace any existing
3666 connection structure with the new one.
3668 don't even free the old one if it did exist, that one is talloc_stolen
3669 by the same node in the tree anyway and will be deleted when the new data
3672 static void *add_killtcp_callback(void *parm, void *data)
/*
 * Queue a TCP connection for termination.
 *
 * Both endpoints are canonicalized into local copies.  The owning vnn
 * is searched by destination, then by source, with
 * ctdb->single_ip_vnn as a further candidate; if none matches the
 * request is rejected with "not a public address".
 *
 * On first use for a vnn a struct ctdb_kill_tcp is allocated under the
 * vnn (with capture_fd = -1, its own connection tree and a destructor)
 * and stored in vnn->killtcp.  The connection is inserted into the
 * tree keyed by killtcp_key(dst, src).  A capture socket is opened on
 * the vnn's interface if there is none yet, and when no fd event
 * exists one is added together with a one-second timer running
 * ctdb_tickle_sentenced_connections().
 *
 * The talloc_free(vnn->killtcp) at the end is presumably the failure
 * path for the capture-socket error above -- the label and returns are
 * on lines not shown.
 */
3678 add a tcp socket to the list of connections we want to RST
3680 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3684 ctdb_sock_addr src, dst;
3685 struct ctdb_kill_tcp *killtcp;
3686 struct ctdb_killtcp_con *con;
3687 struct ctdb_vnn *vnn;
3689 ctdb_canonicalize_ip(s, &src);
3690 ctdb_canonicalize_ip(d, &dst);
3692 vnn = find_public_ip_vnn(ctdb, &dst);
3694 vnn = find_public_ip_vnn(ctdb, &src);
3697 /* if it is not a public ip it could be our 'single ip' */
3698 if (ctdb->single_ip_vnn) {
3699 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3700 vnn = ctdb->single_ip_vnn;
3705 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3709 killtcp = vnn->killtcp;
3711 /* If this is the first connection to kill we must allocate
3714 if (killtcp == NULL) {
3715 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3716 CTDB_NO_MEMORY(ctdb, killtcp);
3719 killtcp->ctdb = ctdb;
/* -1 marks "capture socket not opened yet", tested further down */
3720 killtcp->capture_fd = -1;
3721 killtcp->connections = trbt_create(killtcp, 0);
3723 vnn->killtcp = killtcp;
3724 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3729 /* create a structure that describes this connection we want to
3730 RST and store it in killtcp->connections
3732 con = talloc(killtcp, struct ctdb_killtcp_con);
3733 CTDB_NO_MEMORY(ctdb, con);
3734 con->src_addr = src;
3735 con->dst_addr = dst;
3737 con->killtcp = killtcp;
/* key order is (dst, src): capture_tcp_handler() looks entries up with
 * the captured packet's (src, dst) */
3740 trbt_insertarray32_callback(killtcp->connections,
3741 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3742 add_killtcp_callback, con);
3745 If we don't have a socket to listen on yet we must create it
3747 if (killtcp->capture_fd == -1) {
3748 const char *iface = ctdb_vnn_iface_string(vnn);
3749 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3750 if (killtcp->capture_fd == -1) {
3751 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3752 "socket on iface '%s' for killtcp (%s)\n",
3753 iface, strerror(errno)));
3759 if (killtcp->fde == NULL) {
3760 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
3761 killtcp->capture_fd,
3763 capture_tcp_handler, killtcp);
/* the fd event owns the descriptor from here on */
3764 tevent_fd_set_auto_close(killtcp->fde);
3766 /* We also need to set up some events to tickle all these connections
3767 until they are all reset
3769 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3770 ctdb_tickle_sentenced_connections, killtcp);
3773 /* tickle him once now */
3782 talloc_free(vnn->killtcp);
3783 vnn->killtcp = NULL;
/*
 * Control handler: interpret indata.dptr as a struct ctdb_connection
 * and hand its endpoints to ctdb_killtcp_add_connection(), returning
 * that function's result.
 * NOTE(review): indata.dsize is not checked here -- presumably the
 * control dispatcher validates the size; confirm.
 */
3788 kill a TCP connection.
3790 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3792 struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
3794 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
/*
 * Control handler: replace the whole tickle array of one public
 * address with the list carried in indata
 * (a struct ctdb_tickle_list_old).
 *
 * The blob must be large enough for the fixed header and then for
 * list->num connections.  The address must be a known public address;
 * an update for an address hosted by this node is reported as
 * redundant.  Otherwise any existing array is freed and a new one is
 * allocated under the vnn and filled with a memcpy of the received
 * connections.
 *
 * NOTE(review): the second size test uses `<`, so a blob longer than
 * header + num entries is accepted although the comment says the size
 * must match.  The product sizeof(struct ctdb_connection) * list->num
 * is computed from an unvalidated count -- confirm it cannot wrap on
 * platforms with a 32-bit size_t.
 */
3798 called by a daemon to inform us of the entire list of TCP tickles for
3799 a particular public address.
3800 this control should only be sent by the node that is currently serving
3801 that public address.
3803 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3805 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
3806 struct ctdb_tcp_array *tcparray;
3807 struct ctdb_vnn *vnn;
3809 /* We must at least have tickles.num or else we cant verify the size
3810 of the received data blob
3812 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
3813 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
3817 /* verify that the size of data matches what we expect */
3818 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3819 + sizeof(struct ctdb_connection) * list->num) {
3820 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3824 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3825 ctdb_addr_to_str(&list->addr)));
3827 vnn = find_public_ip_vnn(ctdb, &list->addr);
3829 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3830 ctdb_addr_to_str(&list->addr)));
3835 if (vnn->pnn == ctdb->pnn) {
3837 ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
3838 ctdb_addr_to_str(&list->addr)));
3842 /* remove any old ticklelist we might have */
3843 talloc_free(vnn->tcp_array);
3844 vnn->tcp_array = NULL;
3846 tcparray = talloc(vnn, struct ctdb_tcp_array);
3847 CTDB_NO_MEMORY(ctdb, tcparray);
3849 tcparray->num = list->num;
3851 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3852 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3854 memcpy(tcparray->connections, &list->connections[0],
3855 sizeof(struct ctdb_connection)*tcparray->num);
3857 /* We now have a new fresh tickle list array for this vnn */
3858 vnn->tcp_array = tcparray;
3864 called to return the full list of tickles for the public address associated
3865 with the provided vnn
/*
 * Control handler: return the tickle list of the public address in
 * indata.dptr (a ctdb_sock_addr) as a struct ctdb_tickle_list_old
 * allocated under `outdata`.  The reply is sized for `num`
 * connections, taken from the vnn's tcp_array, and the connections are
 * copied in with memcpy.  An address that is not a public address is
 * reported as an error.
 * NOTE(review): indata.dsize is not validated in the lines shown --
 * presumably checked by the dispatcher; confirm.
 */
3867 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3869 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3870 struct ctdb_tickle_list_old *list;
3871 struct ctdb_tcp_array *tcparray;
3873 struct ctdb_vnn *vnn;
3875 vnn = find_public_ip_vnn(ctdb, addr);
3877 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3878 ctdb_addr_to_str(addr)));
3883 tcparray = vnn->tcp_array;
3885 num = tcparray->num;
3890 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3891 + sizeof(struct ctdb_connection) * num;
3893 outdata->dptr = talloc_size(outdata, outdata->dsize);
3894 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3895 list = (struct ctdb_tickle_list_old *)outdata->dptr;
3900 memcpy(&list->connections[0], tcparray->connections,
3901 sizeof(struct ctdb_connection) * num);
/*
 * Broadcast the tickle list of one public address to all nodes.
 *
 * A struct ctdb_tickle_list_old sized for the connections in
 * `tcparray` is built in a temporary buffer allocated under `ctdb`,
 * sent as CTDB_CONTROL_SET_TCP_TICKLE_LIST to CTDB_BROADCAST_ALL with
 * CTDB_CTRL_FLAG_NOREPLY, and the buffer is freed afterwards.  A send
 * failure is logged.
 */
3909 set the list of all tcp tickles for a public address
3911 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3912 ctdb_sock_addr *addr,
3913 struct ctdb_tcp_array *tcparray)
3917 struct ctdb_tickle_list_old *list;
3920 num = tcparray->num;
3925 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3926 sizeof(struct ctdb_connection) * num;
3927 data.dptr = talloc_size(ctdb, data.dsize);
3928 CTDB_NO_MEMORY(ctdb, data.dptr);
3930 list = (struct ctdb_tickle_list_old *)data.dptr;
3934 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3937 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3938 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3939 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3941 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3945 talloc_free(data.dptr);
/*
 * Periodic timer handler (private_data is the ctdb context).
 *
 * For each vnn, the checks on vnn->pnn (must equal this node's pnn)
 * and tcp_update_needed decide whether its tickle list is sent with
 * ctdb_send_set_tcp_tickles_for_ip().  After a successful send the
 * flag is cleared; a failed send is logged.  The handler re-arms
 * itself on ctdb->tickle_update_context using the
 * tickle_update_interval tunable.
 */
3952 perform tickle updates if required
3954 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3955 struct tevent_timer *te,
3956 struct timeval t, void *private_data)
3958 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3960 struct ctdb_vnn *vnn;
3962 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3963 /* we only send out updates for public addresses that
3966 if (ctdb->pnn != vnn->pnn) {
3969 /* We only send out the updates if we need to */
3970 if (!vnn->tcp_update_needed) {
3973 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3974 &vnn->public_address,
3977 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3978 ctdb_addr_to_str(&vnn->public_address)));
3981 ("Sent tickle update for public address %s\n",
3982 ctdb_addr_to_str(&vnn->public_address)));
3983 vnn->tcp_update_needed = false;
3987 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3988 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3989 ctdb_update_tcp_tickles, ctdb);
/*
 * Create ctdb->tickle_update_context (a talloc context under ctdb) and
 * schedule the first ctdb_update_tcp_tickles() run after
 * tickle_update_interval seconds.  The timer is parented on that
 * context.
 * NOTE(review): the talloc_new() result is not checked in the lines
 * shown.
 */
3993 start periodic update of tcp tickles
3995 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3997 ctdb->tickle_update_context = talloc_new(ctdb);
3999 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
4000 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
4001 ctdb_update_tcp_tickles, ctdb);
/*
 * State for a sequence of gratuitous ARPs.  The `iface` and `count`
 * members used by send_gratious_arp() are declared on lines not shown
 * here.
 */
4007 struct control_gratious_arp {
4008 struct ctdb_context *ctdb;
/* address passed to ctdb_sys_send_arp() */
4009 ctdb_sock_addr addr;
/*
 * Timer handler (private_data is a struct control_gratious_arp).
 * Sends one ARP for arp->addr on arp->iface with ctdb_sys_send_arp()
 * and logs a failure together with strerror(errno).  arp->count is
 * compared with CTDB_ARP_REPEAT; the handler otherwise re-arms itself
 * CTDB_ARP_INTERVAL seconds ahead, with the timer parented on `arp`.
 */
4015 send a control_gratuitous arp
4017 static void send_gratious_arp(struct tevent_context *ev,
4018 struct tevent_timer *te,
4019 struct timeval t, void *private_data)
4022 struct control_gratious_arp *arp = talloc_get_type(private_data,
4023 struct control_gratious_arp);
4025 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4027 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4028 arp->iface, strerror(errno)));
4033 if (arp->count == CTDB_ARP_REPEAT) {
4038 tevent_add_timer(arp->ctdb->ev, arp,
4039 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4040 send_gratious_arp, arp);
/*
 * Control handler: start sending gratuitous ARPs for the address and
 * interface in indata (a struct ctdb_addr_info_old).
 *
 * indata must hold at least the fixed header, and its size is then
 * checked against header + gratious_arp->len.  A control_gratious_arp
 * is allocated under ctdb with a copy of the address and a duplicate
 * of the interface name, and send_gratious_arp() is scheduled to run
 * immediately (timeval_zero()).
 *
 * NOTE(review): talloc_strdup() reads iface up to the first NUL; the
 * visible checks bound the blob size but do not show that iface is
 * NUL-terminated within `len` bytes -- confirm, or use a bounded copy.
 */
4047 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4049 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
4050 struct control_gratious_arp *arp;
4052 /* verify the size of indata */
4053 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4054 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
4055 (unsigned)indata.dsize,
4056 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
4060 ( offsetof(struct ctdb_addr_info_old, iface)
4061 + gratious_arp->len ) ){
4063 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4064 "but should be %u bytes\n",
4065 (unsigned)indata.dsize,
4066 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
4071 arp = talloc(ctdb, struct control_gratious_arp);
4072 CTDB_NO_MEMORY(ctdb, arp);
4075 arp->addr = gratious_arp->addr;
4076 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4077 CTDB_NO_MEMORY(ctdb, arp->iface);
4080 tevent_add_timer(arp->ctdb->ev, arp,
4081 timeval_zero(), send_gratious_arp, arp);
/*
 * Control handler: add the public address described by indata
 * (a struct ctdb_addr_info_old: address, mask, interface string).
 * After the size checks on indata the request is logged and passed to
 * ctdb_add_public_address() with a final argument of `true`; a failure
 * of that call is logged.
 */
4086 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4088 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4091 /* verify the size of indata */
4092 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4093 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4097 ( offsetof(struct ctdb_addr_info_old, iface)
4100 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4101 "but should be %u bytes\n",
4102 (unsigned)indata.dsize,
4103 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4107 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4109 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4112 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
/*
 * Context passed to delete_ip_callback(): carries the original control
 * request so that a reply can be sent once the release has finished.
 */
4119 struct delete_ip_callback_state {
/* the request to answer; re-parented onto this state by the caller */
4120 struct ctdb_req_control_old *c;
/*
 * Completion callback for the CTDB_CONTROL_RELEASE_IP sent by
 * ctdb_control_del_public_address().  Forwards the received status and
 * error message as the reply to the stored request, then frees the
 * callback state (which also owns the request, see talloc_steal in the
 * caller).
 */
4124 called when releaseip event finishes for del_public_address
4126 static void delete_ip_callback(struct ctdb_context *ctdb,
4127 int32_t status, TDB_DATA data,
4128 const char *errormsg,
4131 struct delete_ip_callback_state *state =
4132 talloc_get_type(private_data, struct delete_ip_callback_state);
4134 /* If release failed then fail. */
4135 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4136 talloc_free(private_data);
/*
 * Control handler: delete the public address described by indata
 * (a struct ctdb_addr_info_old).
 *
 * After the size checks, ctdb->vnn is searched for the address.
 * - Hosted on this node: the vnn is marked delete_pending, a
 *   CTDB_CONTROL_RELEASE_IP carrying a struct ctdb_public_ip is sent,
 *   the request `c` is re-parented onto the callback state and
 *   *async_reply is set, so the reply is produced later by
 *   delete_ip_callback().
 * - Hosted elsewhere: the address is deleted right away with
 *   do_delete_ip().
 * An unknown address is logged as an error.
 *
 * NOTE(review): delete_pending is set before the allocations and the
 * send; whether the failure paths reset it is not visible here.
 */
4139 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4140 struct ctdb_req_control_old *c,
4141 TDB_DATA indata, bool *async_reply)
4143 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
4144 struct ctdb_vnn *vnn;
4146 /* verify the size of indata */
4147 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
4148 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
4152 ( offsetof(struct ctdb_addr_info_old, iface)
4155 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4156 "but should be %u bytes\n",
4157 (unsigned)indata.dsize,
4158 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
4162 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4164 /* walk over all public addresses until we find a match */
4165 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4166 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4167 if (vnn->pnn == ctdb->pnn) {
4168 struct delete_ip_callback_state *state;
4169 struct ctdb_public_ip *ip;
4173 vnn->delete_pending = true;
4175 state = talloc(ctdb,
4176 struct delete_ip_callback_state);
4177 CTDB_NO_MEMORY(ctdb, state);
/* ip is a child of state, so it is released together with it */
4180 ip = talloc(state, struct ctdb_public_ip);
4183 (__location__ " Out of memory\n"));
4188 ip->addr = pub->addr;
4190 data.dsize = sizeof(struct ctdb_public_ip);
4191 data.dptr = (unsigned char *)ip;
4193 ret = ctdb_daemon_send_control(ctdb,
4196 CTDB_CONTROL_RELEASE_IP,
4203 (__location__ "Unable to send "
4204 "CTDB_CONTROL_RELEASE_IP\n"));
/* keep the request alive until delete_ip_callback() replies to it */
4209 state->c = talloc_steal(state, c);
4210 *async_reply = true;
4212 /* This IP is not hosted on the
4213 * current node so just delete it
4215 do_delete_ip(ctdb, vnn);
4222 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4223 ctdb_addr_to_str(&pub->addr)));
4228 struct ipreallocated_callback_state {
4229 struct ctdb_req_control_old *c;
4232 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4233 int status, void *p)
4235 struct ipreallocated_callback_state *state =
4236 talloc_get_type(p, struct ipreallocated_callback_state);
4240 (" \"ipreallocated\" event script failed (status %d)\n",
4242 if (status == -ETIME) {
4243 ctdb_ban_self(ctdb);
4247 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4251 /* A control to run the ipreallocated event */
4252 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4253 struct ctdb_req_control_old *c,
4257 struct ipreallocated_callback_state *state;
4259 state = talloc(ctdb, struct ipreallocated_callback_state);
4260 CTDB_NO_MEMORY(ctdb, state);
4262 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4264 ret = ctdb_event_script_callback(ctdb, state,
4265 ctdb_ipreallocated_callback, state,
4266 CTDB_EVENT_IPREALLOCATED,
4270 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4275 /* tell the control that we will be reply asynchronously */
4276 state->c = talloc_steal(state, c);
4277 *async_reply = true;
4283 /* This function is called from the recovery daemon to verify that a remote
4284 node has the expected ip allocation.
4285 This is verified against ctdb->ip_tree
4287 static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4288 struct ctdb_public_ip_list_old *ips,
4291 struct public_ip_list *tmp_ip;
4294 if (ctdb->ip_tree == NULL) {
4295 /* don't know the expected allocation yet, assume remote node
4304 for (i=0; i<ips->num; i++) {
4305 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4306 if (tmp_ip == NULL) {
4307 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4311 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4315 if (tmp_ip->pnn != ips->ips[i].pnn) {
4317 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4319 ctdb_addr_to_str(&ips->ips[i].addr),
4320 ips->ips[i].pnn, tmp_ip->pnn));
4328 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4330 struct public_ip_list *tmp_ip;
4332 /* IP tree is never built if DisableIPFailover is set */
4333 if (ctdb->tunable.disable_ip_failover != 0) {
4337 if (ctdb->ip_tree == NULL) {
4338 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4342 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4343 if (tmp_ip == NULL) {
4344 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4348 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4349 tmp_ip->pnn = ip->pnn;
4354 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4356 TALLOC_FREE(ctdb->ip_tree);
4359 struct ctdb_reloadips_handle {
4360 struct ctdb_context *ctdb;
4361 struct ctdb_req_control_old *c;
4365 struct tevent_fd *fde;
4368 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4370 if (h == h->ctdb->reload_ips) {
4371 h->ctdb->reload_ips = NULL;
4374 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4377 ctdb_kill(h->ctdb, h->child, SIGKILL);
4381 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
4382 struct tevent_timer *te,
4383 struct timeval t, void *private_data)
4385 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4390 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
4391 struct tevent_fd *fde,
4392 uint16_t flags, void *private_data)
4394 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4399 ret = sys_read(h->fd[0], &res, 1);
4400 if (ret < 1 || res != 0) {
4401 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4409 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4411 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4412 struct ctdb_public_ip_list_old *ips;
4413 struct ctdb_vnn *vnn;
4414 struct client_async_data *async_data;
4415 struct timeval timeout;
4417 struct ctdb_client_control_state *state;
4421 CTDB_NO_MEMORY(ctdb, mem_ctx);
4423 /* Read IPs from local node */
4424 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4425 CTDB_CURRENT_NODE, mem_ctx, &ips);
4428 ("Unable to fetch public IPs from local node\n"));
4429 talloc_free(mem_ctx);
4433 /* Read IPs file - this is safe since this is a child process */
4435 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4436 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4437 talloc_free(mem_ctx);
4441 async_data = talloc_zero(mem_ctx, struct client_async_data);
4442 CTDB_NO_MEMORY(ctdb, async_data);
4444 /* Compare IPs between node and file for IPs to be deleted */
4445 for (i = 0; i < ips->num; i++) {
4447 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4448 if (ctdb_same_ip(&vnn->public_address,
4449 &ips->ips[i].addr)) {
4450 /* IP is still in file */
4456 /* Delete IP ips->ips[i] */
4457 struct ctdb_addr_info_old *pub;
4460 ("IP %s no longer configured, deleting it\n",
4461 ctdb_addr_to_str(&ips->ips[i].addr)));
4463 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
4464 CTDB_NO_MEMORY(ctdb, pub);
4466 pub->addr = ips->ips[i].addr;
4470 timeout = TAKEOVER_TIMEOUT();
4472 data.dsize = offsetof(struct ctdb_addr_info_old,
4474 data.dptr = (uint8_t *)pub;
4476 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4477 CTDB_CONTROL_DEL_PUBLIC_IP,
4478 0, data, async_data,
4480 if (state == NULL) {
4483 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4487 ctdb_client_async_add(async_data, state);
4491 /* Compare IPs between node and file for IPs to be added */
4493 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4494 for (i = 0; i < ips->num; i++) {
4495 if (ctdb_same_ip(&vnn->public_address,
4496 &ips->ips[i].addr)) {
4497 /* IP already on node */
4501 if (i == ips->num) {
4502 /* Add IP ips->ips[i] */
4503 struct ctdb_addr_info_old *pub;
4504 const char *ifaces = NULL;
4509 ("New IP %s configured, adding it\n",
4510 ctdb_addr_to_str(&vnn->public_address)));
4512 uint32_t pnn = ctdb_get_pnn(ctdb);
4514 data.dsize = sizeof(pnn);
4515 data.dptr = (uint8_t *)&pnn;
4517 ret = ctdb_client_send_message(
4519 CTDB_BROADCAST_CONNECTED,
4520 CTDB_SRVID_REBALANCE_NODE,
4523 DEBUG(DEBUG_WARNING,
4524 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4530 ifaces = vnn->ifaces[0];
4532 while (vnn->ifaces[iface] != NULL) {
4533 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4534 vnn->ifaces[iface]);
4538 len = strlen(ifaces) + 1;
4539 pub = talloc_zero_size(mem_ctx,
4540 offsetof(struct ctdb_addr_info_old, iface) + len);
4541 CTDB_NO_MEMORY(ctdb, pub);
4543 pub->addr = vnn->public_address;
4544 pub->mask = vnn->public_netmask_bits;
4546 memcpy(&pub->iface[0], ifaces, pub->len);
4548 timeout = TAKEOVER_TIMEOUT();
4550 data.dsize = offsetof(struct ctdb_addr_info_old,
4552 data.dptr = (uint8_t *)pub;
4554 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4555 CTDB_CONTROL_ADD_PUBLIC_IP,
4556 0, data, async_data,
4558 if (state == NULL) {
4561 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4565 ctdb_client_async_add(async_data, state);
4569 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4570 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4574 talloc_free(mem_ctx);
4578 talloc_free(mem_ctx);
4582 /* This control is sent to force the node to re-read the public addresses file
4583 and drop any addresses we should nnot longer host, and add new addresses
4584 that we are now able to host
4586 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
4588 struct ctdb_reloadips_handle *h;
4589 pid_t parent = getpid();
4591 if (ctdb->reload_ips != NULL) {
4592 talloc_free(ctdb->reload_ips);
4593 ctdb->reload_ips = NULL;
4596 h = talloc(ctdb, struct ctdb_reloadips_handle);
4597 CTDB_NO_MEMORY(ctdb, h);
4602 if (pipe(h->fd) == -1) {
4603 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4608 h->child = ctdb_fork(ctdb);
4609 if (h->child == (pid_t)-1) {
4610 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4618 if (h->child == 0) {
4619 signed char res = 0;
4622 debug_extra = talloc_asprintf(NULL, "reloadips:");
4624 prctl_set_comment("ctdb_reloadips");
4625 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4626 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4629 res = ctdb_reloadips_child(ctdb);
4631 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4635 sys_write(h->fd[1], &res, 1);
4636 /* make sure we die when our parent dies */
4637 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4643 h->c = talloc_steal(h, c);
4646 set_close_on_exec(h->fd[0]);
4648 talloc_set_destructor(h, ctdb_reloadips_destructor);
4651 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
4652 ctdb_reloadips_child_handler, (void *)h);
4653 tevent_fd_set_auto_close(h->fde);
4655 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
4656 ctdb_reloadips_timeout_event, h);
4658 /* we reply later */
4659 *async_reply = true;