Copyright (C) Ronnie Sahlberg 2007
Copyright (C) Andrew Tridgell 2007
+ Copyright (C) Martin Schwenke 2011
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "includes.h"
-#include "lib/events/events.h"
+#include "lib/tevent/tevent.h"
#include "lib/tdb/include/tdb.h"
#include "lib/util/dlinklist.h"
#include "system/network.h"
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT 3
+struct ctdb_iface {
+ struct ctdb_iface *prev, *next;
+ const char *name;
+ bool link_up;
+ uint32_t references;
+};
+
+static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
+{
+ if (vnn->iface) {
+ return vnn->iface->name;
+ }
+
+ return "__none__";
+}
+
+static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
+{
+ struct ctdb_iface *i;
+
+ /* Verify that we dont have an entry for this ip yet */
+ for (i=ctdb->ifaces;i;i=i->next) {
+ if (strcmp(i->name, iface) == 0) {
+ return 0;
+ }
+ }
+
+ /* create a new structure for this interface */
+ i = talloc_zero(ctdb, struct ctdb_iface);
+ CTDB_NO_MEMORY_FATAL(ctdb, i);
+ i->name = talloc_strdup(i, iface);
+ CTDB_NO_MEMORY(ctdb, i->name);
+ i->link_up = false;
+
+ DLIST_ADD(ctdb->ifaces, i);
+
+ return 0;
+}
+
+static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
+ const char *iface)
+{
+ struct ctdb_iface *i;
+
+ /* Verify that we dont have an entry for this ip yet */
+ for (i=ctdb->ifaces;i;i=i->next) {
+ if (strcmp(i->name, iface) == 0) {
+ return i;
+ }
+ }
+
+ return NULL;
+}
+
+static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
+ struct ctdb_vnn *vnn)
+{
+ int i;
+ struct ctdb_iface *cur = NULL;
+ struct ctdb_iface *best = NULL;
+
+ for (i=0; vnn->ifaces[i]; i++) {
+
+ cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
+ if (cur == NULL) {
+ continue;
+ }
+
+ if (!cur->link_up) {
+ continue;
+ }
+
+ if (best == NULL) {
+ best = cur;
+ continue;
+ }
+
+ if (cur->references < best->references) {
+ best = cur;
+ continue;
+ }
+ }
+
+ return best;
+}
+
+static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
+ struct ctdb_vnn *vnn)
+{
+ struct ctdb_iface *best = NULL;
+
+ if (vnn->iface) {
+ DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+ "still assigned to iface '%s'\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ ctdb_vnn_iface_string(vnn)));
+ return 0;
+ }
+
+ best = ctdb_vnn_best_iface(ctdb, vnn);
+ if (best == NULL) {
+ DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
+ "cannot assign to iface any iface\n",
+ ctdb_addr_to_str(&vnn->public_address)));
+ return -1;
+ }
+
+ vnn->iface = best;
+ best->references++;
+ vnn->pnn = ctdb->pnn;
+
+ DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+ "now assigned to iface '%s' refs[%d]\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ ctdb_vnn_iface_string(vnn),
+ best->references));
+ return 0;
+}
+
+static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
+ struct ctdb_vnn *vnn)
+{
+ DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+ "now unassigned (old iface '%s' refs[%d])\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ ctdb_vnn_iface_string(vnn),
+ vnn->iface?vnn->iface->references:0));
+ if (vnn->iface) {
+ vnn->iface->references--;
+ }
+ vnn->iface = NULL;
+ if (vnn->pnn == ctdb->pnn) {
+ vnn->pnn = -1;
+ }
+}
+
+static bool ctdb_vnn_available(struct ctdb_context *ctdb,
+ struct ctdb_vnn *vnn)
+{
+ int i;
+
+ if (vnn->iface && vnn->iface->link_up) {
+ return true;
+ }
+
+ for (i=0; vnn->ifaces[i]; i++) {
+ struct ctdb_iface *cur;
+
+ cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
+ if (cur == NULL) {
+ continue;
+ }
+
+ if (cur->link_up) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
struct ctdb_takeover_arp {
struct ctdb_context *ctdb;
uint32_t count;
struct ctdb_takeover_arp);
int i, ret;
struct ctdb_tcp_array *tcparray;
+ const char *iface = ctdb_vnn_iface_string(arp->vnn);
- ret = ctdb_sys_send_arp(&arp->addr, arp->vnn->iface);
+ ret = ctdb_sys_send_arp(&arp->addr, iface);
if (ret != 0) {
- DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed (%s)\n", strerror(errno)));
+ DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
+ iface, strerror(errno)));
}
tcparray = arp->tcparray;
}
event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
- timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
+ timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
ctdb_control_send_arp, arp);
}
+static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
+ struct ctdb_vnn *vnn)
+{
+ struct ctdb_takeover_arp *arp;
+ struct ctdb_tcp_array *tcparray;
+
+ if (!vnn->takeover_ctx) {
+ vnn->takeover_ctx = talloc_new(vnn);
+ if (!vnn->takeover_ctx) {
+ return -1;
+ }
+ }
+
+ arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
+ if (!arp) {
+ return -1;
+ }
+
+ arp->ctdb = ctdb;
+ arp->addr = vnn->public_address;
+ arp->vnn = vnn;
+
+ tcparray = vnn->tcp_array;
+ if (tcparray) {
+ /* add all of the known tcp connections for this IP to the
+ list of tcp connections to send tickle acks for */
+ arp->tcparray = talloc_steal(arp, tcparray);
+
+ vnn->tcp_array = NULL;
+ vnn->tcp_update_needed = true;
+ }
+
+ event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
+ timeval_zero(), ctdb_control_send_arp, arp);
+
+ return 0;
+}
+
struct takeover_callback_state {
struct ctdb_req_control *c;
ctdb_sock_addr *addr;
struct ctdb_vnn *vnn;
};
+struct ctdb_do_takeip_state {
+ struct ctdb_req_control *c;
+ struct ctdb_vnn *vnn;
+};
+
/*
called when takeip event finishes
*/
-static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
- void *private_data)
+static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
{
- struct takeover_callback_state *state =
- talloc_get_type(private_data, struct takeover_callback_state);
- struct ctdb_takeover_arp *arp;
- struct ctdb_tcp_array *tcparray;
+ struct ctdb_do_takeip_state *state =
+ talloc_get_type(private_data, struct ctdb_do_takeip_state);
+ int32_t ret;
+ TDB_DATA data;
if (status != 0) {
+ struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+
+ if (status == -ETIME) {
+ ctdb_ban_self(ctdb);
+ }
DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
- ctdb_addr_to_str(state->addr),
- state->vnn->iface));
+ ctdb_addr_to_str(&state->vnn->public_address),
+ ctdb_vnn_iface_string(state->vnn)));
ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+
+ node->flags |= NODE_FLAGS_UNHEALTHY;
talloc_free(state);
return;
}
- if (!state->vnn->takeover_ctx) {
- state->vnn->takeover_ctx = talloc_new(ctdb);
- if (!state->vnn->takeover_ctx) {
- goto failed;
- }
+ ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
+ if (ret != 0) {
+ ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+ talloc_free(state);
+ return;
}
- arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
- if (!arp) goto failed;
-
- arp->ctdb = ctdb;
- arp->addr = *state->addr;
- arp->vnn = state->vnn;
-
- tcparray = state->vnn->tcp_array;
- if (tcparray) {
- /* add all of the known tcp connections for this IP to the
- list of tcp connections to send tickle acks for */
- arp->tcparray = talloc_steal(arp, tcparray);
+ data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
+ data.dsize = strlen((char *)data.dptr) + 1;
+ DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
- state->vnn->tcp_array = NULL;
- state->vnn->tcp_update_needed = true;
- }
+ ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
- event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx,
- timeval_zero(), ctdb_control_send_arp, arp);
/* the control succeeded */
ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
talloc_free(state);
return;
+}
-failed:
- ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+/*
+ take over an ip address
+ */
+static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
+ struct ctdb_req_control *c,
+ struct ctdb_vnn *vnn)
+{
+ int ret;
+ struct ctdb_do_takeip_state *state;
+
+ ret = ctdb_vnn_assign_iface(ctdb, vnn);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
+ "assin a usable interface\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits));
+ return -1;
+ }
+
+ state = talloc(vnn, struct ctdb_do_takeip_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+ state->c = talloc_steal(ctdb, c);
+ state->vnn = vnn;
+
+ DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn)));
+
+ ret = ctdb_event_script_callback(ctdb,
+ state,
+ ctdb_do_takeip_callback,
+ state,
+ false,
+ CTDB_EVENT_TAKE_IP,
+ "%s %s %u",
+ ctdb_vnn_iface_string(vnn),
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits);
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ ctdb_vnn_iface_string(vnn)));
+ talloc_free(state);
+ return -1;
+ }
+
+ return 0;
+}
+
+struct ctdb_do_updateip_state {
+ struct ctdb_req_control *c;
+ struct ctdb_iface *old;
+ struct ctdb_vnn *vnn;
+};
+
+/*
+ called when updateip event finishes
+ */
+static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
+ void *private_data)
+{
+ struct ctdb_do_updateip_state *state =
+ talloc_get_type(private_data, struct ctdb_do_updateip_state);
+ int32_t ret;
+
+ if (status != 0) {
+ if (status == -ETIME) {
+ ctdb_ban_self(ctdb);
+ }
+ DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
+ ctdb_addr_to_str(&state->vnn->public_address),
+ state->old->name,
+ ctdb_vnn_iface_string(state->vnn)));
+
+ /*
+ * All we can do is reset the old interface
+ * and let the next run fix it
+ */
+ ctdb_vnn_unassign_iface(ctdb, state->vnn);
+ state->vnn->iface = state->old;
+ state->vnn->iface->references++;
+
+ ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+ talloc_free(state);
+ return;
+ }
+
+ ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
+ if (ret != 0) {
+ ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+ talloc_free(state);
+ return;
+ }
+
+ /* the control succeeded */
+ ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
talloc_free(state);
return;
}
+/*
+ update (move) an ip address
+ */
+static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
+ struct ctdb_req_control *c,
+ struct ctdb_vnn *vnn)
+{
+ int ret;
+ struct ctdb_do_updateip_state *state;
+ struct ctdb_iface *old = vnn->iface;
+ char *new_name;
+
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ ret = ctdb_vnn_assign_iface(ctdb, vnn);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
+ "assin a usable interface (old iface '%s')\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ old->name));
+ return -1;
+ }
+
+ new_name = ctdb_vnn_iface_string(vnn);
+ if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
+ /* A benign update from one interface onto itself.
+ * no need to run the eventscripts in this case, just return
+ * success.
+ */
+ ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
+ return 0;
+ }
+
+ state = talloc(vnn, struct ctdb_do_updateip_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+ state->c = talloc_steal(ctdb, c);
+ state->old = old;
+ state->vnn = vnn;
+
+ DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
+ "interface %s to %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ old->name,
+ new_name));
+
+ ret = ctdb_event_script_callback(ctdb,
+ state,
+ ctdb_do_updateip_callback,
+ state,
+ false,
+ CTDB_EVENT_UPDATE_IP,
+ "%s %s %s %u",
+ state->old->name,
+ new_name,
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ old->name, new_name));
+ talloc_free(state);
+ return -1;
+ }
+
+ return 0;
+}
+
/*
Find the vnn of the node that has a public ip address
returns -1 if the address is not known as a public address
return NULL;
}
-
/*
take over an ip address
*/
-int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
+int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
struct ctdb_req_control *c,
- TDB_DATA indata,
+ TDB_DATA indata,
bool *async_reply)
{
int ret;
- struct takeover_callback_state *state;
struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
struct ctdb_vnn *vnn;
+ bool have_ip = false;
+ bool do_updateip = false;
+ bool do_takeip = false;
+ struct ctdb_iface *best_iface = NULL;
+
+ if (pip->pnn != ctdb->pnn) {
+ DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
+ "with pnn %d, but we're node %d\n",
+ ctdb_addr_to_str(&pip->addr),
+ pip->pnn, ctdb->pnn));
+ return -1;
+ }
/* update out vnn list */
vnn = find_public_ip_vnn(ctdb, &pip->addr);
if (vnn == NULL) {
- DEBUG(DEBUG_ERR,("takeoverip called for an ip '%s' that is not a public address\n",
+ DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
ctdb_addr_to_str(&pip->addr)));
return 0;
}
- vnn->pnn = pip->pnn;
- /* if our kernel already has this IP, do nothing */
- if (ctdb_sys_have_ip(&pip->addr)) {
- return 0;
+ have_ip = ctdb_sys_have_ip(&pip->addr);
+ best_iface = ctdb_vnn_best_iface(ctdb, vnn);
+ if (best_iface == NULL) {
+ DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
+ "a usable interface (old %s, have_ip %d)\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn),
+ have_ip));
+ return -1;
}
- state = talloc(ctdb, struct takeover_callback_state);
- CTDB_NO_MEMORY(ctdb, state);
+ if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
+ DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
+ have_ip = false;
+ }
- state->c = talloc_steal(ctdb, c);
- state->addr = talloc(ctdb, ctdb_sock_addr);
- CTDB_NO_MEMORY(ctdb, state->addr);
+ if (vnn->iface == NULL && have_ip) {
+ DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+ "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
+ ctdb_addr_to_str(&vnn->public_address)));
+ return 0;
+ }
- *state->addr = pip->addr;
- state->vnn = vnn;
+ if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
+ DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+ "and we have it on iface[%s], but it was assigned to node %d"
+ "and we are node %d, banning ourself\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
+ ctdb_ban_self(ctdb);
+ return -1;
+ }
- DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
- ctdb_addr_to_str(&pip->addr),
- vnn->public_netmask_bits,
- vnn->iface));
+ if (vnn->pnn == -1 && have_ip) {
+ vnn->pnn = ctdb->pnn;
+ DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+ "and we already have it on iface[%s], update local daemon\n",
+ ctdb_addr_to_str(&vnn->public_address),
+ ctdb_vnn_iface_string(vnn)));
+ return 0;
+ }
- ret = ctdb_event_script_callback(ctdb,
- timeval_current_ofs(ctdb->tunable.script_timeout, 0),
- state, takeover_ip_callback, state,
- "takeip %s %s %u",
- vnn->iface,
- talloc_strdup(state, ctdb_addr_to_str(&pip->addr)),
- vnn->public_netmask_bits);
+ if (vnn->iface) {
+ if (vnn->iface->link_up) {
+ /* only move when the rebalance gains something */
+ if (vnn->iface->references > (best_iface->references + 1)) {
+ do_updateip = true;
+ }
+ } else if (vnn->iface != best_iface) {
+ do_updateip = true;
+ }
+ }
- if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+ if (!have_ip) {
+ if (do_updateip) {
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ do_updateip = false;
+ }
+ do_takeip = true;
+ }
+
+ if (do_takeip) {
+ ret = ctdb_do_takeip(ctdb, c, vnn);
+ if (ret != 0) {
+ return -1;
+ }
+ } else if (do_updateip) {
+ ret = ctdb_do_updateip(ctdb, c, vnn);
+ if (ret != 0) {
+ return -1;
+ }
+ } else {
+ /*
+ * The interface is up and the kernel known the ip
+ * => do nothing
+ */
+ DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
ctdb_addr_to_str(&pip->addr),
- vnn->iface));
- talloc_free(state);
- return -1;
+ vnn->public_netmask_bits,
+ ctdb_vnn_iface_string(vnn)));
+ return 0;
}
/* tell ctdb_control.c that we will be replying asynchronously */
talloc_get_type(private_data, struct takeover_callback_state);
TDB_DATA data;
+ if (status == -ETIME) {
+ ctdb_ban_self(ctdb);
+ }
+
/* send a message to all clients of this node telling them
that the cluster has been reconfigured and they should
release any sockets on this IP */
/* kill clients that have registered with this IP */
release_kill_clients(ctdb, state->addr);
-
- /* the control succeeded */
+
+ ctdb_vnn_unassign_iface(ctdb, state->vnn);
+
+ /* the control succeeded */
ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
talloc_free(state);
}
/* update our vnn list */
vnn = find_public_ip_vnn(ctdb, &pip->addr);
if (vnn == NULL) {
- DEBUG(DEBUG_ERR,("takeoverip called for an ip '%s' that is not a public address\n",
+ DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
ctdb_addr_to_str(&pip->addr)));
return 0;
}
vnn->takeover_ctx = NULL;
if (!ctdb_sys_have_ip(&pip->addr)) {
- DEBUG(DEBUG_INFO,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
+ DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
ctdb_addr_to_str(&pip->addr),
vnn->public_netmask_bits,
- vnn->iface));
+ ctdb_vnn_iface_string(vnn)));
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ return 0;
+ }
+
+ if (vnn->iface == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " release_ip of IP %s is known to the kernel, "
+ "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
+ ctdb_addr_to_str(&vnn->public_address)));
return 0;
}
- DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s\n",
+ DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
ctdb_addr_to_str(&pip->addr),
vnn->public_netmask_bits,
- vnn->iface));
+ ctdb_vnn_iface_string(vnn),
+ pip->pnn));
state = talloc(ctdb, struct takeover_callback_state);
CTDB_NO_MEMORY(ctdb, state);
state->vnn = vnn;
ret = ctdb_event_script_callback(ctdb,
- timeval_current_ofs(ctdb->tunable.script_timeout, 0),
state, release_ip_callback, state,
- "releaseip %s %s %u",
- vnn->iface,
- talloc_strdup(state, ctdb_addr_to_str(&pip->addr)),
+ false,
+ CTDB_EVENT_RELEASE_IP,
+ "%s %s %u",
+ ctdb_vnn_iface_string(vnn),
+ ctdb_addr_to_str(&pip->addr),
vnn->public_netmask_bits);
if (ret != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
ctdb_addr_to_str(&pip->addr),
- vnn->iface));
+ ctdb_vnn_iface_string(vnn)));
talloc_free(state);
return -1;
}
}
-static int ctdb_add_public_address(struct ctdb_context *ctdb, ctdb_sock_addr *addr, unsigned mask, const char *iface)
+static int ctdb_add_public_address(struct ctdb_context *ctdb,
+ ctdb_sock_addr *addr,
+ unsigned mask, const char *ifaces)
{
struct ctdb_vnn *vnn;
+ uint32_t num = 0;
+ char *tmp;
+ const char *iface;
+ int i;
+ int ret;
/* Verify that we dont have an entry for this ip yet */
for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
/* create a new vnn structure for this ip address */
vnn = talloc_zero(ctdb, struct ctdb_vnn);
CTDB_NO_MEMORY_FATAL(ctdb, vnn);
- vnn->iface = talloc_strdup(vnn, iface);
- CTDB_NO_MEMORY(ctdb, vnn->iface);
+ vnn->ifaces = talloc_array(vnn, const char *, num + 2);
+ tmp = talloc_strdup(vnn, ifaces);
+ CTDB_NO_MEMORY_FATAL(ctdb, tmp);
+ for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
+ vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
+ CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
+ vnn->ifaces[num] = talloc_strdup(vnn, iface);
+ CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
+ num++;
+ }
+ talloc_free(tmp);
+ vnn->ifaces[num] = NULL;
vnn->public_address = *addr;
vnn->public_netmask_bits = mask;
vnn->pnn = -1;
-
+ if (ctdb_sys_have_ip(addr)) {
+ DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
+ vnn->pnn = ctdb->pnn;
+ }
+
+ for (i=0; vnn->ifaces[i]; i++) {
+ ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
+ if (ret != 0) {
+ DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
+ "for public_address[%s]\n",
+ vnn->ifaces[i], ctdb_addr_to_str(addr)));
+ talloc_free(vnn);
+ return -1;
+ }
+ if (i == 0) {
+ vnn->iface = ctdb_find_iface(ctdb, vnn->ifaces[i]);
+ }
+ }
+
DLIST_ADD(ctdb->vnn, vnn);
return 0;
}
-
/*
setup the event script directory
*/
unsigned mask;
ctdb_sock_addr addr;
const char *addrstr;
- const char *iface;
+ const char *ifaces;
char *tok, *line;
line = lines[i];
talloc_free(lines);
return -1;
}
- iface = ctdb->default_public_interface;
+ ifaces = ctdb->default_public_interface;
} else {
- iface = tok;
+ ifaces = tok;
}
- if (!addrstr || !parse_ip_mask(addrstr, iface, &addr, &mask)) {
+ if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
talloc_free(lines);
return -1;
}
- if (ctdb_add_public_address(ctdb, &addr, mask, iface)) {
+ if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
talloc_free(lines);
return -1;
return 0;
}
+int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
+ const char *iface,
+ const char *ip)
+{
+ struct ctdb_vnn *svnn;
+ struct ctdb_iface *cur = NULL;
+ bool ok;
+ int ret;
+ svnn = talloc_zero(ctdb, struct ctdb_vnn);
+ CTDB_NO_MEMORY(ctdb, svnn);
+ svnn->ifaces = talloc_array(svnn, const char *, 2);
+ CTDB_NO_MEMORY(ctdb, svnn->ifaces);
+ svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
+ CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
+ svnn->ifaces[1] = NULL;
-struct ctdb_public_ip_list {
- struct ctdb_public_ip_list *next;
- uint32_t pnn;
- ctdb_sock_addr addr;
-};
+ ok = parse_ip(ip, iface, 0, &svnn->public_address);
+ if (!ok) {
+ talloc_free(svnn);
+ return -1;
+ }
+
+ ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
+ if (ret != 0) {
+ DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
+ "for single_ip[%s]\n",
+ svnn->ifaces[0],
+ ctdb_addr_to_str(&svnn->public_address)));
+ talloc_free(svnn);
+ return -1;
+ }
+ /* assume the single public ip interface is initially "good" */
+ cur = ctdb_find_iface(ctdb, iface);
+ if (cur == NULL) {
+ DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
+ return -1;
+ }
+ cur->link_up = true;
+
+ ret = ctdb_vnn_assign_iface(ctdb, svnn);
+ if (ret != 0) {
+ talloc_free(svnn);
+ return -1;
+ }
+
+ ctdb->single_ip_vnn = svnn;
+ return 0;
+}
/* Given a physical node, return the number of
public addresses that is currently assigned to this node.
struct ctdb_all_public_ips *public_ips;
int i;
- public_ips = ctdb->nodes[pnn]->public_ips;
+ public_ips = ctdb->nodes[pnn]->available_public_ips;
if (public_ips == NULL) {
return -1;
return 0;
}
-struct ctdb_public_ip_list *
-add_ip_to_merged_list(struct ctdb_context *ctdb,
- TALLOC_CTX *tmp_ctx,
- struct ctdb_public_ip_list *ip_list,
- struct ctdb_public_ip *ip)
+#define IP_KEYLEN 4
+static uint32_t *ip_key(ctdb_sock_addr *ip)
{
- struct ctdb_public_ip_list *tmp_ip;
+ static uint32_t key[IP_KEYLEN];
- /* do we already have this ip in our merged list ?*/
- for (tmp_ip=ip_list;tmp_ip;tmp_ip=tmp_ip->next) {
+ bzero(key, sizeof(key));
- /* we already have this public ip in the list */
- if (ctdb_same_ip(&tmp_ip->addr, &ip->addr)) {
- return ip_list;
- }
+ switch (ip->sa.sa_family) {
+ case AF_INET:
+ key[3] = htonl(ip->ip.sin_addr.s_addr);
+ break;
+ case AF_INET6:
+ key[0] = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
+ key[1] = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
+ key[2] = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
+ key[3] = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
+ break;
+ default:
+ DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
+ return key;
+ }
+
+ return key;
+}
+
+static void *add_ip_callback(void *parm, void *data)
+{
+ struct ctdb_public_ip_list *this_ip = parm;
+ struct ctdb_public_ip_list *prev_ip = data;
+
+ if (prev_ip == NULL) {
+ return parm;
+ }
+ if (this_ip->pnn == -1) {
+ this_ip->pnn = prev_ip->pnn;
}
- /* this is a new public ip, we must add it to the list */
- tmp_ip = talloc_zero(tmp_ctx, struct ctdb_public_ip_list);
- CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
- tmp_ip->pnn = ip->pnn;
- tmp_ip->addr = ip->addr;
- tmp_ip->next = ip_list;
+ return parm;
+}
+
+void getips_count_callback(void *param, void *data)
+{
+ struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
+ struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
- return tmp_ip;
+ new_ip->next = *ip_list;
+ *ip_list = new_ip;
}
-struct ctdb_public_ip_list *
-create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
+static struct ctdb_public_ip_list *
+create_merged_ip_list(struct ctdb_context *ctdb)
{
int i, j;
- struct ctdb_public_ip_list *ip_list = NULL;
+ struct ctdb_public_ip_list *ip_list;
struct ctdb_all_public_ips *public_ips;
+ if (ctdb->ip_tree != NULL) {
+ talloc_free(ctdb->ip_tree);
+ ctdb->ip_tree = NULL;
+ }
+ ctdb->ip_tree = trbt_create(ctdb, 0);
+
for (i=0;i<ctdb->num_nodes;i++) {
- public_ips = ctdb->nodes[i]->public_ips;
+ public_ips = ctdb->nodes[i]->known_public_ips;
if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
continue;
}
for (j=0;j<public_ips->num;j++) {
- ip_list = add_ip_to_merged_list(ctdb, tmp_ctx,
- ip_list, &public_ips->ips[j]);
+ struct ctdb_public_ip_list *tmp_ip;
+
+ tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
+ CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
+ tmp_ip->pnn = public_ips->ips[j].pnn;
+ tmp_ip->addr = public_ips->ips[j].addr;
+ tmp_ip->next = NULL;
+
+ trbt_insertarray32_callback(ctdb->ip_tree,
+ IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
+ add_ip_callback,
+ tmp_ip);
}
}
+ ip_list = NULL;
+ trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
+
return ip_list;
}
-/*
- make any IP alias changes for public addresses that are necessary
+/*
+ * This is the length of the longtest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes. The implementation means that all
+ * addresses end up being 128 bits long.
+ * Not static, so we can easily link it into a unit test.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
*/
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
{
- int i, num_healthy, retries;
- struct ctdb_public_ip ip;
- struct ctdb_public_ipv4 ipv4;
- uint32_t mask;
- struct ctdb_public_ip_list *all_ips, *tmp_ip;
- int maxnode, maxnum=0, minnode, minnum=0, num;
- TDB_DATA data;
- struct timeval timeout;
- struct client_async_data *async_data;
- struct ctdb_client_control_state *state;
- TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
-
+ uint32_t ip1_k[IP_KEYLEN];
+ uint32_t *t;
+ int i;
+ uint32_t x;
- ZERO_STRUCT(ip);
+ uint32_t distance = 0;
- /* Count how many completely healthy nodes we have */
- num_healthy = 0;
- for (i=0;i<nodemap->num;i++) {
- if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
- num_healthy++;
+ memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
+ t = ip_key(ip2);
+ for (i=0; i<IP_KEYLEN; i++) {
+ x = ip1_k[i] ^ t[i];
+ if (x == 0) {
+ distance += 32;
+ } else {
+ /* Count number of leading zeroes.
+ * FIXME? This could be optimised...
+ */
+ while ((x & (1 << 31)) == 0) {
+ x <<= 1;
+ distance += 1;
+ }
}
}
- if (num_healthy > 0) {
- /* We have healthy nodes, so only consider them for
- serving public addresses
- */
- mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
- } else {
- /* We didnt have any completely healthy nodes so
- use "disabled" nodes as a fallback
- */
- mask = NODE_FLAGS_INACTIVE;
- }
-
- /* since nodes only know about those public addresses that
- can be served by that particular node, no single node has
- a full list of all public addresses that exist in the cluster.
- Walk over all node structures and create a merged list of
- all public addresses that exist in the cluster.
- */
- all_ips = create_merged_ip_list(ctdb, tmp_ctx);
+ return distance;
+}
- /* If we want deterministic ip allocations, i.e. that the ip addresses
- will always be allocated the same way for a specific set of
- available/unavailable nodes.
- */
- if (1 == ctdb->tunable.deterministic_public_ips) {
- DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
- for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
- tmp_ip->pnn = i%nodemap->num;
- }
- }
+/* Calculate the IP distance for the given IP relative to IPs on the
+ given node. The ips argument is generally the all_ips variable
+ used in the main part of the algorithm.
+ * Not static, so we can easily link it into a unit test.
+ */
+uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+ struct ctdb_public_ip_list *ips,
+ int pnn)
+{
+ struct ctdb_public_ip_list *t;
+ uint32_t d;
+ uint32_t sum = 0;
- /* mark all public addresses with a masked node as being served by
- node -1
- */
- for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
- if (tmp_ip->pnn == -1) {
+ for (t=ips; t != NULL; t=t->next) {
+ if (t->pnn != pnn) {
continue;
}
- if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
- tmp_ip->pnn = -1;
+
+ /* Optimisation: We never calculate the distance
+ * between an address and itself. This allows us to
+ * calculate the effect of removing an address from a
+ * node by simply calculating the distance between
+ * that address and all of the exitsing addresses.
+ * Moreover, we assume that we're only ever dealing
+ * with addresses from all_ips so we can identify an
+ * address via a pointer rather than doing a more
+ * expensive address comparison. */
+ if (&(t->addr) == ip) {
+ continue;
}
+
+ d = ip_distance(ip, &(t->addr));
+ sum += d * d; /* Cheaper than pulling in math.h :-) */
}
- /* verify that the assigned nodes can serve that public ip
- and set it to -1 if not
- */
- for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
- if (tmp_ip->pnn == -1) {
+ return sum;
+}
+
+/* Return the LCP2 imbalance metric for addresses currently assigned
+ to the given node.
+ * Not static, so we can easily link it into a unit test.
+ */
+uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
+{
+ struct ctdb_public_ip_list *t;
+
+ uint32_t imbalance = 0;
+
+ for (t=all_ips; t!=NULL; t=t->next) {
+ if (t->pnn != pnn) {
continue;
}
- if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
- /* this node can not serve this ip. */
- tmp_ip->pnn = -1;
- }
+ /* Pass the rest of the IPs rather than the whole
+ all_ips input list.
+ */
+ imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
}
+ return imbalance;
+}
+
+/* Allocate any unassigned IPs just by looping through the IPs and
+ * finding the best node for each.
+ * Not static, so we can easily link it into a unit test.
+ */
+void basic_allocate_unassigned(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips)
+{
+ struct ctdb_public_ip_list *tmp_ip;
- /* now we must redistribute all public addresses with takeover node
- -1 among the nodes available
- */
- retries = 0;
-try_again:
/* loop over all ip's and find a physical node to cover for
each unassigned ip.
*/
}
}
}
+}
- /* If we dont want ips to fail back after a node becomes healthy
- again, we wont even try to reallocat the ip addresses so that
- they are evenly spread out.
- This can NOT be used at the same time as DeterministicIPs !
- */
- if (1 == ctdb->tunable.no_ip_failback) {
- if (1 == ctdb->tunable.deterministic_public_ips) {
- DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
- }
- goto finished;
- }
-
+/* Basic non-deterministic rebalancing algorithm.
+ * Not static, so we can easily link it into a unit test.
+ */
+bool basic_failback(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ int num_ips,
+ int *retries)
+{
+ int i;
+ int maxnode, maxnum=0, minnode, minnum=0, num;
+ struct ctdb_public_ip_list *tmp_ip;
- /* now, try to make sure the ip adresses are evenly distributed
- across the node.
- for each ip address, loop over all nodes that can serve this
- ip and make sure that the difference between the node
- serving the most and the node serving the least ip's are not greater
- than 1.
+ /* for each ip address, loop over all nodes that can serve
+ this ip and make sure that the difference between the node
+ serving the most and the node serving the least ip's are
+ not greater than 1.
*/
for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
if (tmp_ip->pnn == -1) {
}
}
}
- if (maxnode == -1) {
- DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
- ctdb_addr_to_str(&tmp_ip->addr)));
+ if (maxnode == -1) {
+ DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
+ ctdb_addr_to_str(&tmp_ip->addr)));
+
+ continue;
+ }
+
+ /* If we want deterministic IPs then dont try to reallocate
+ them to spread out the load.
+ */
+ if (1 == ctdb->tunable.deterministic_public_ips) {
+ continue;
+ }
+
+ /* if the spread between the smallest and largest coverage by
+ a node is >=2 we steal one of the ips from the node with
+ most coverage to even things out a bit.
+ try to do this a limited number of times since we dont
+ want to spend too much time balancing the ip coverage.
+ */
+ if ( (maxnum > minnum+1)
+ && (*retries < (num_ips + 5)) ){
+ struct ctdb_public_ip_list *tmp;
+
+ /* mark one of maxnode's vnn's as unassigned and try
+ again
+ */
+ for (tmp=all_ips;tmp;tmp=tmp->next) {
+ if (tmp->pnn == maxnode) {
+ tmp->pnn = -1;
+ (*retries)++;
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/* Do necessary LCP2 initialisation. Bury it in a function here so
+ * that we can unit test it.
+ * Not static, so we can easily link it into a unit test.
+ */
+void lcp2_init(struct ctdb_context * tmp_ctx,
+ struct ctdb_node_map * nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ uint32_t **lcp2_imbalances,
+ bool **newly_healthy)
+{
+ int i;
+ struct ctdb_public_ip_list *tmp_ip;
+
+ *newly_healthy = talloc_array(tmp_ctx, bool, nodemap->num);
+ CTDB_NO_MEMORY_FATAL(tmp_ctx, *newly_healthy);
+ *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
+ CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
+
+ for (i=0;i<nodemap->num;i++) {
+ (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
+ /* First step: is the node "healthy"? */
+ (*newly_healthy)[i] = ! (bool)(nodemap->nodes[i].flags & mask);
+ }
+
+ /* 2nd step: if a ndoe has IPs assigned then it must have been
+ * healthy before, so we remove it from consideration... */
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn != -1) {
+ (*newly_healthy)[tmp_ip->pnn] = false;
+ }
+ }
+}
+
+/* Allocate any unassigned addresses using the LCP2 algorithm to find
+ * the IP/node combination that will cost the least.
+ * Not static, so we can easily link it into a unit test.
+ */
+void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ uint32_t *lcp2_imbalances)
+{
+ struct ctdb_public_ip_list *tmp_ip;
+ int dstnode;
+
+ int minnode;
+ uint32_t mindsum, dstdsum, dstimbl, minimbl;
+ struct ctdb_public_ip_list *minip;
+
+ bool should_loop = true;
+ bool have_unassigned = true;
+
+ while (have_unassigned && should_loop) {
+ should_loop = false;
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+ DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+ minnode = -1;
+ mindsum = 0;
+ minip = NULL;
+
+ /* loop over each unassigned ip. */
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn != -1) {
+ continue;
+ }
+
+ for (dstnode=0; dstnode < nodemap->num; dstnode++) {
+ /* only check nodes that can actually serve this ip */
+ if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+ if (nodemap->nodes[dstnode].flags & mask) {
+ continue;
+ }
+
+ dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
+ dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+ DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
+ ctdb_addr_to_str(&(tmp_ip->addr)),
+ dstnode,
+ dstimbl - lcp2_imbalances[dstnode]));
+
+
+ if ((minnode == -1) || (dstdsum < mindsum)) {
+ minnode = dstnode;
+ minimbl = dstimbl;
+ mindsum = dstdsum;
+ minip = tmp_ip;
+ should_loop = true;
+ }
+ }
+ }
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+ /* If we found one then assign it to the given node. */
+ if (minnode != -1) {
+ minip->pnn = minnode;
+ lcp2_imbalances[minnode] = minimbl;
+ DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+ ctdb_addr_to_str(&(minip->addr)),
+ minnode,
+ mindsum));
+ }
+
+ /* There might be a better way but at least this is clear. */
+ have_unassigned = false;
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn == -1) {
+ have_unassigned = true;
+ }
+ }
+ }
+
+ /* We know if we have an unassigned addresses so we might as
+ * well optimise.
+ */
+ if (have_unassigned) {
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn == -1) {
+ DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
+ ctdb_addr_to_str(&tmp_ip->addr)));
+ }
+ }
+ }
+}
+
+/* LCP2 algorithm for rebalancing the cluster. Given a candidate node
+ * to move IPs from, determines the best IP/destination node
+ * combination to move from the source node.
+ *
+ * Not static, so we can easily link it into a unit test.
+ */
+bool lcp2_failback_candidate(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ struct ctdb_public_ip_list *all_ips,
+ int srcnode,
+ uint32_t candimbl,
+ uint32_t *lcp2_imbalances,
+ bool *newly_healthy)
+{
+ int dstnode, mindstnode;
+ uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
+ uint32_t minsrcimbl, mindstimbl;
+ struct ctdb_public_ip_list *minip;
+ struct ctdb_public_ip_list *tmp_ip;
+
+ /* Find an IP and destination node that best reduces imbalance. */
+ minip = NULL;
+ minsrcimbl = 0;
+ mindstnode = -1;
+ mindstimbl = 0;
+
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+ DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
+
+ for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
+ /* Only consider addresses on srcnode. */
+ if (tmp_ip->pnn != srcnode) {
+ continue;
+ }
+
+ /* What is this IP address costing the source node? */
+ srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
+ srcimbl = candimbl - srcdsum;
+
+ /* Consider this IP address would cost each potential
+ * destination node. Destination nodes are limited to
+ * those that are newly healthy, since we don't want
+ * to do gratuitous failover of IPs just to make minor
+ * balance improvements.
+ */
+ for (dstnode=0; dstnode < nodemap->num; dstnode++) {
+ if (! newly_healthy[dstnode]) {
+ continue;
+ }
+ /* only check nodes that can actually serve this ip */
+ if (can_node_serve_ip(ctdb, dstnode, tmp_ip)) {
+ /* no it couldnt so skip to the next node */
+ continue;
+ }
+
+ dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
+ dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+ DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+ srcnode, srcimbl - lcp2_imbalances[srcnode],
+ ctdb_addr_to_str(&(tmp_ip->addr)),
+ dstnode, dstimbl - lcp2_imbalances[dstnode]));
+
+ if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
+ ((mindstnode == -1) || \
+ ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
+
+ minip = tmp_ip;
+ minsrcimbl = srcimbl;
+ mindstnode = dstnode;
+ mindstimbl = dstimbl;
+ }
+ }
+ }
+ DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+ if (mindstnode != -1) {
+ /* We found a move that makes things better... */
+ DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
+ srcnode, minsrcimbl - lcp2_imbalances[srcnode],
+ ctdb_addr_to_str(&(minip->addr)),
+ mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
+
+
+ lcp2_imbalances[srcnode] = srcimbl;
+ lcp2_imbalances[mindstnode] = mindstimbl;
+ minip->pnn = mindstnode;
+
+ return true;
+ }
+
+ return false;
+
+}
+
+struct lcp2_imbalance_pnn {
+ uint32_t imbalance;
+ int pnn;
+};
+
+int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
+{
+ const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
+ const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
+
+ if (lipa->imbalance > lipb->imbalance) {
+ return -1;
+ } else if (lipa->imbalance == lipb->imbalance) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+/* LCP2 algorithm for rebalancing the cluster. This finds the source
+ * node with the highest LCP2 imbalance, and then determines the best
+ * IP/destination node combination to move from the source node.
+ *
+ * Not static, so we can easily link it into a unit test.
+ */
+bool lcp2_failback(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ uint32_t mask,
+ struct ctdb_public_ip_list *all_ips,
+ uint32_t *lcp2_imbalances,
+ bool *newly_healthy)
+{
+ int i, num_newly_healthy;
+ struct lcp2_imbalance_pnn * lips;
+ bool ret;
+
+ /* It is only worth continuing if we have suitable target
+ * nodes to transfer IPs to. This check is much cheaper than
+ * continuing on...
+ */
+ num_newly_healthy = 0;
+ for (i = 0; i < nodemap->num; i++) {
+ if (newly_healthy[i]) {
+ num_newly_healthy++;
+ }
+ }
+ if (num_newly_healthy == 0) {
+ return false;
+ }
+
+ /* Put the imbalances and nodes into an array, sort them and
+ * iterate through candidates. Usually the 1st one will be
+ * used, so this doesn't cost much...
+ */
+ lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
+ for (i = 0; i < nodemap->num; i++) {
+ lips[i].imbalance = lcp2_imbalances[i];
+ lips[i].pnn = i;
+ }
+ qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
+ lcp2_cmp_imbalance_pnn);
+
+ ret = false;
+ for (i = 0; i < nodemap->num; i++) {
+ /* This means that all nodes had 0 or 1 addresses, so
+ * can't be imbalanced.
+ */
+ if (lips[i].imbalance == 0) {
+ break;
+ }
+
+ if (lcp2_failback_candidate(ctdb,
+ nodemap,
+ all_ips,
+ lips[i].pnn,
+ lips[i].imbalance,
+ lcp2_imbalances,
+ newly_healthy)) {
+ ret = true;
+ break;
+ }
+ }
+
+ talloc_free(lips);
+ return ret;
+}
+
+/* The calculation part of the IP allocation algorithm.
+ * Not static, so we can easily link it into a unit test.
+ */
+void ctdb_takeover_run_core(struct ctdb_context *ctdb,
+ struct ctdb_node_map *nodemap,
+ struct ctdb_public_ip_list **all_ips_p)
+{
+ int i, num_healthy, retries, num_ips;
+ uint32_t mask;
+ struct ctdb_public_ip_list *all_ips, *tmp_ip;
+ uint32_t *lcp2_imbalances;
+ bool *newly_healthy;
+
+ TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+ /* Count how many completely healthy nodes we have */
+ num_healthy = 0;
+ for (i=0;i<nodemap->num;i++) {
+ if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
+ num_healthy++;
+ }
+ }
+
+ if (num_healthy > 0) {
+ /* We have healthy nodes, so only consider them for
+ serving public addresses
+ */
+ mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
+ } else {
+ /* We didnt have any completely healthy nodes so
+ use "disabled" nodes as a fallback
+ */
+ mask = NODE_FLAGS_INACTIVE;
+ }
+
+ /* since nodes only know about those public addresses that
+ can be served by that particular node, no single node has
+ a full list of all public addresses that exist in the cluster.
+ Walk over all node structures and create a merged list of
+ all public addresses that exist in the cluster.
+
+ keep the tree of ips around as ctdb->ip_tree
+ */
+ all_ips = create_merged_ip_list(ctdb);
+ *all_ips_p = all_ips; /* minimal code changes */
+
+ /* Count how many ips we have */
+ num_ips = 0;
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ num_ips++;
+ }
+
+ /* If we want deterministic ip allocations, i.e. that the ip addresses
+ will always be allocated the same way for a specific set of
+ available/unavailable nodes.
+ */
+ if (1 == ctdb->tunable.deterministic_public_ips) {
+ DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
+ for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
+ tmp_ip->pnn = i%nodemap->num;
+ }
+ }
+
+
+ /* mark all public addresses with a masked node as being served by
+ node -1
+ */
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn == -1) {
+ continue;
+ }
+ if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
+ tmp_ip->pnn = -1;
+ }
+ }
+ /* verify that the assigned nodes can serve that public ip
+ and set it to -1 if not
+ */
+ for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+ if (tmp_ip->pnn == -1) {
continue;
}
+ if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
+ /* this node can not serve this ip. */
+ tmp_ip->pnn = -1;
+ }
+ }
- /* If we want deterministic IPs then dont try to reallocate
- them to spread out the load.
- */
+ if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
+ lcp2_init(tmp_ctx, nodemap, mask, all_ips, &lcp2_imbalances, &newly_healthy);
+ }
+
+ /* now we must redistribute all public addresses with takeover node
+ -1 among the nodes available
+ */
+ retries = 0;
+try_again:
+ if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
+ lcp2_allocate_unassigned(ctdb, nodemap, mask, all_ips, lcp2_imbalances);
+ } else {
+ basic_allocate_unassigned(ctdb, nodemap, mask, all_ips);
+ }
+
+ /* If we dont want ips to fail back after a node becomes healthy
+ again, we wont even try to reallocat the ip addresses so that
+ they are evenly spread out.
+ This can NOT be used at the same time as DeterministicIPs !
+ */
+ if (1 == ctdb->tunable.no_ip_failback) {
if (1 == ctdb->tunable.deterministic_public_ips) {
- continue;
+ DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
}
+ goto finished;
+ }
- /* if the spread between the smallest and largest coverage by
- a node is >=2 we steal one of the ips from the node with
- most coverage to even things out a bit.
- try to do this at most 5 times since we dont want to spend
- too much time balancing the ip coverage.
- */
- if ( (maxnum > minnum+1)
- && (retries < 5) ){
- struct ctdb_public_ip_list *tmp;
- /* mark one of maxnode's vnn's as unassigned and try
- again
- */
- for (tmp=all_ips;tmp;tmp=tmp->next) {
- if (tmp->pnn == maxnode) {
- tmp->pnn = -1;
- retries++;
- goto try_again;
- }
- }
+ /* now, try to make sure the ip adresses are evenly distributed
+ across the node.
+ */
+ if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
+ if (lcp2_failback(ctdb, nodemap, mask, all_ips, lcp2_imbalances, newly_healthy)) {
+ goto try_again;
+ }
+ } else {
+ if (basic_failback(ctdb, nodemap, mask, all_ips, num_ips, &retries)) {
+ goto try_again;
}
}
-
/* finished distributing the public addresses, now just send the
info out to the nodes
*/
or -1 if there is no node that can cover this ip
*/
+ return;
+}
+
+/*
+ make any IP alias changes for public addresses that are necessary
+ */
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+{
+ int i;
+ struct ctdb_public_ip ip;
+ struct ctdb_public_ipv4 ipv4;
+ uint32_t *nodes;
+ struct ctdb_public_ip_list *all_ips, *tmp_ip;
+ TDB_DATA data;
+ struct timeval timeout;
+ struct client_async_data *async_data;
+ struct ctdb_client_control_state *state;
+ TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+ /*
+ * ip failover is completely disabled, just send out the
+ * ipreallocated event.
+ */
+ if (ctdb->tunable.disable_ip_failover != 0) {
+ goto ipreallocated;
+ }
+
+ ZERO_STRUCT(ip);
+
+ /* Do the IP reassignment calculations */
+ ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
+
/* now tell all nodes to delete any alias that they should not
have. This will be a NOOP on nodes that don't currently
hold the given alias */
return -1;
}
+ipreallocated:
+ /* tell all nodes to update natwg */
+ /* send the flags update natgw on all connected nodes */
+ data.dptr = discard_const("ipreallocated");
+ data.dsize = strlen((char *)data.dptr) + 1;
+ nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+ if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
+ nodes, 0, TAKEOVER_TIMEOUT(),
+ false, data,
+ NULL, NULL,
+ NULL) != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
+ }
+
talloc_free(tmp_ctx);
return 0;
}
struct ctdb_control_tcp_addr new_addr;
struct ctdb_control_tcp_addr *tcp_sock = NULL;
struct ctdb_tcp_list *tcp;
- struct ctdb_control_tcp_vnn t;
+ struct ctdb_tcp_connection t;
int ret;
TDB_DATA data;
struct ctdb_client_ip *ip;
tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
break;
default:
- DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed to ctdb_control_tcp_client. size was %d but only allowed sizes are %lu and %lu\n", (int)indata.dsize, sizeof(struct ctdb_control_tcp), sizeof(struct ctdb_control_tcp_addr)));
+ DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
+ "to ctdb_control_tcp_client. size was %d but "
+ "only allowed sizes are %lu and %lu\n",
+ (int)indata.dsize,
+ (long unsigned)sizeof(struct ctdb_control_tcp),
+ (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
return -1;
}
DLIST_ADD(client->tcp_list, tcp);
- t.src = tcp_sock->src;
- t.dest = tcp_sock->dest;
+ t.src_addr = tcp_sock->src;
+ t.dst_addr = tcp_sock->dest;
data.dptr = (uint8_t *)&t;
data.dsize = sizeof(t);
return NULL;
}
+
+
/*
called by a daemon to inform us of a TCP connection that one of its
clients managing that should tickled with an ACK when IP takeover is
done
*/
-int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
+int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
{
- struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
+ struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
struct ctdb_tcp_array *tcparray;
struct ctdb_tcp_connection tcp;
struct ctdb_vnn *vnn;
- vnn = find_public_ip_vnn(ctdb, &p->dest);
+ vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
if (vnn == NULL) {
DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
- ctdb_addr_to_str(&p->dest)));
+ ctdb_addr_to_str(&p->dst_addr)));
return -1;
}
tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
CTDB_NO_MEMORY(ctdb, tcparray->connections);
- tcparray->connections[tcparray->num].src_addr = p->src;
- tcparray->connections[tcparray->num].dst_addr = p->dest;
+ tcparray->connections[tcparray->num].src_addr = p->src_addr;
+ tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
tcparray->num++;
+
+ if (tcp_update_needed) {
+ vnn->tcp_update_needed = true;
+ }
return 0;
}
/* Do we already have this tickle ?*/
- tcp.src_addr = p->src;
- tcp.dst_addr = p->dest;
+ tcp.src_addr = p->src_addr;
+ tcp.dst_addr = p->dst_addr;
if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
ctdb_addr_to_str(&tcp.dst_addr),
CTDB_NO_MEMORY(ctdb, tcparray->connections);
vnn->tcp_array = tcparray;
- tcparray->connections[tcparray->num].src_addr = p->src;
- tcparray->connections[tcparray->num].dst_addr = p->dest;
+ tcparray->connections[tcparray->num].src_addr = p->src_addr;
+ tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
tcparray->num++;
DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
ntohs(tcp.dst_addr.ip.sin_port),
vnn->pnn));
+ if (tcp_update_needed) {
+ vnn->tcp_update_needed = true;
+ }
+
return 0;
}
}
+/*
+ called by a daemon to inform us of a TCP connection that one of its
+ clients used are no longer needed in the tickle database
+ */
+int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+ struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
+
+ ctdb_remove_tcp_connection(ctdb, conn);
+
+ return 0;
+}
+
+
/*
called when a daemon restarts - send all tickes for all public addresses
we are serving immediately to the new node.
for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
if (!ctdb_sys_have_ip(&vnn->public_address)) {
+ ctdb_vnn_unassign_iface(ctdb, vnn);
continue;
}
- if (vnn->pnn == ctdb->pnn) {
- vnn->pnn = -1;
+ if (!vnn->iface) {
+ continue;
}
- ctdb_event_script(ctdb, "releaseip %s %s %u",
- vnn->iface,
- talloc_strdup(ctdb, ctdb_addr_to_str(&vnn->public_address)),
+ ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
+ ctdb_vnn_iface_string(vnn),
+ ctdb_addr_to_str(&vnn->public_address),
vnn->public_netmask_bits);
release_kill_clients(ctdb, &vnn->public_address);
+ ctdb_vnn_unassign_iface(ctdb, vnn);
}
}
int i, num, len;
struct ctdb_all_public_ips *ips;
struct ctdb_vnn *vnn;
+ bool only_available = false;
+
+ if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
+ only_available = true;
+ }
/* count how many public ip structures we have */
num = 0;
ips = talloc_zero_size(outdata, len);
CTDB_NO_MEMORY(ctdb, ips);
- outdata->dsize = len;
- outdata->dptr = (uint8_t *)ips;
-
- ips->num = num;
i = 0;
for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
+ if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
+ continue;
+ }
ips->ips[i].pnn = vnn->pnn;
ips->ips[i].addr = vnn->public_address;
i++;
}
+ ips->num = i;
+ len = offsetof(struct ctdb_all_public_ips, ips) +
+ i*sizeof(struct ctdb_public_ip);
+
+ outdata->dsize = len;
+ outdata->dptr = (uint8_t *)ips;
return 0;
}
return 0;
}
+int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
+ struct ctdb_req_control *c,
+ TDB_DATA indata,
+ TDB_DATA *outdata)
+{
+ int i, num, len;
+ ctdb_sock_addr *addr;
+ struct ctdb_control_public_ip_info *info;
+ struct ctdb_vnn *vnn;
+
+ addr = (ctdb_sock_addr *)indata.dptr;
+
+ vnn = find_public_ip_vnn(ctdb, addr);
+ if (vnn == NULL) {
+ /* if it is not a public ip it could be our 'single ip' */
+ if (ctdb->single_ip_vnn) {
+ if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
+ vnn = ctdb->single_ip_vnn;
+ }
+ }
+ }
+ if (vnn == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
+ "'%s'not a public address\n",
+ ctdb_addr_to_str(addr)));
+ return -1;
+ }
+
+ /* count how many public ip structures we have */
+ num = 0;
+ for (;vnn->ifaces[num];) {
+ num++;
+ }
+
+ len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
+ num*sizeof(struct ctdb_control_iface_info);
+ info = talloc_zero_size(outdata, len);
+ CTDB_NO_MEMORY(ctdb, info);
+
+ info->ip.addr = vnn->public_address;
+ info->ip.pnn = vnn->pnn;
+ info->active_idx = 0xFFFFFFFF;
+
+ for (i=0; vnn->ifaces[i]; i++) {
+ struct ctdb_iface *cur;
+
+ cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
+ if (cur == NULL) {
+ DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
+ vnn->ifaces[i]));
+ return -1;
+ }
+ if (vnn->iface == cur) {
+ info->active_idx = i;
+ }
+ strcpy(info->ifaces[i].name, cur->name);
+ info->ifaces[i].link_state = cur->link_up;
+ info->ifaces[i].references = cur->references;
+ }
+ info->num = i;
+ len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
+ i*sizeof(struct ctdb_control_iface_info);
+
+ outdata->dsize = len;
+ outdata->dptr = (uint8_t *)info;
+
+ return 0;
+}
+
+int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
+ struct ctdb_req_control *c,
+ TDB_DATA *outdata)
+{
+ int i, num, len;
+ struct ctdb_control_get_ifaces *ifaces;
+ struct ctdb_iface *cur;
+
+ /* count how many public ip structures we have */
+ num = 0;
+ for (cur=ctdb->ifaces;cur;cur=cur->next) {
+ num++;
+ }
+
+ len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
+ num*sizeof(struct ctdb_control_iface_info);
+ ifaces = talloc_zero_size(outdata, len);
+ CTDB_NO_MEMORY(ctdb, ifaces);
+
+ i = 0;
+ for (cur=ctdb->ifaces;cur;cur=cur->next) {
+ strcpy(ifaces->ifaces[i].name, cur->name);
+ ifaces->ifaces[i].link_state = cur->link_up;
+ ifaces->ifaces[i].references = cur->references;
+ i++;
+ }
+ ifaces->num = i;
+ len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
+ i*sizeof(struct ctdb_control_iface_info);
+
+ outdata->dsize = len;
+ outdata->dptr = (uint8_t *)ifaces;
+
+ return 0;
+}
+
+int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
+ struct ctdb_req_control *c,
+ TDB_DATA indata)
+{
+ struct ctdb_control_iface_info *info;
+ struct ctdb_iface *iface;
+ bool link_up = false;
+
+ info = (struct ctdb_control_iface_info *)indata.dptr;
+
+ if (info->name[CTDB_IFACE_SIZE] != '\0') {
+ int len = strnlen(info->name, CTDB_IFACE_SIZE);
+ DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
+ len, len, info->name));
+ return -1;
+ }
+
+ switch (info->link_state) {
+ case 0:
+ link_up = false;
+ break;
+ case 1:
+ link_up = true;
+ break;
+ default:
+ DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
+ (unsigned int)info->link_state));
+ return -1;
+ }
+
+ if (info->references != 0) {
+ DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
+ (unsigned int)info->references));
+ return -1;
+ }
+
+ iface = ctdb_find_iface(ctdb, info->name);
+ if (iface == NULL) {
+ return -1;
+ }
+
+ if (link_up == iface->link_up) {
+ return 0;
+ }
+
+ DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
+ ("iface[%s] has changed it's link status %s => %s\n",
+ iface->name,
+ iface->link_up?"up":"down",
+ link_up?"up":"down"));
+
+ iface->link_up = link_up;
+ return 0;
+}
+
/*
structure containing the listening socket and the list of tcp connections
/* have tried too many times, just give up */
if (con->count >= 5) {
- talloc_free(con);
+ /* can't delete in traverse: reparent to delete_cons */
+ talloc_steal(param, con);
return;
}
struct timeval t, void *private_data)
{
struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
-
+ void *delete_cons = talloc_new(NULL);
/* loop over all connections sending tickle ACKs */
- trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, NULL);
+ trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
+ /* now we've finished traverse, it's safe to do deletion. */
+ talloc_free(delete_cons);
/* If there are no more connections to kill we can remove the
entire killtcp structure
*/
static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
{
+ struct ctdb_vnn *tmpvnn;
+
+ /* verify that this vnn is still active */
+ for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
+ if (tmpvnn == killtcp->vnn) {
+ break;
+ }
+ }
+
+ if (tmpvnn == NULL) {
+ return 0;
+ }
+
+ if (killtcp->vnn->killtcp != killtcp) {
+ return 0;
+ }
+
killtcp->vnn->killtcp = NULL;
+
return 0;
}
a new structure
*/
if (killtcp == NULL) {
- killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
+ killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
CTDB_NO_MEMORY(ctdb, killtcp);
killtcp->vnn = vnn;
If we dont have a socket to listen on yet we must create it
*/
if (killtcp->capture_fd == -1) {
- killtcp->capture_fd = ctdb_sys_open_capture_socket(vnn->iface, &killtcp->private_data);
+ const char *iface = ctdb_vnn_iface_string(vnn);
+ killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
if (killtcp->capture_fd == -1) {
- DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing socket for killtcp\n"));
+ DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
+ "socket on iface '%s' for killtcp (%s)\n",
+ iface, strerror(errno)));
goto failed;
}
}
if (killtcp->fde == NULL) {
killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
- EVENT_FD_READ | EVENT_FD_AUTOCLOSE,
+ EVENT_FD_READ,
capture_tcp_handler, killtcp);
+ tevent_fd_set_auto_close(killtcp->fde);
/* We also need to set up some events to tickle all these connections
until they are all reset
ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
if (ret != 0) {
- DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp failed (%s)\n", strerror(errno)));
+ DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
+ arp->iface, strerror(errno)));
}
int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
{
struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
-
+ int ret;
/* verify the size of indata */
if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
return -1;
}
- return ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
+ ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
+
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
+ return -1;
+ }
+
+ return 0;
}
/*
/* walk over all public addresses until we find a match */
for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
- TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+ TALLOC_CTX *mem_ctx;
DLIST_REMOVE(ctdb->vnn, vnn);
+ if (vnn->pnn != ctdb->pnn) {
+ if (vnn->iface != NULL) {
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ }
+ talloc_free(vnn);
+ return 0;
+ }
+ vnn->pnn = -1;
+ mem_ctx = talloc_new(ctdb);
+ talloc_steal(mem_ctx, vnn);
ret = ctdb_event_script_callback(ctdb,
- timeval_current_ofs(ctdb->tunable.script_timeout, 0),
mem_ctx, delete_ip_callback, mem_ctx,
- "releaseip %s %s %u",
- vnn->iface,
- talloc_strdup(mem_ctx, ctdb_addr_to_str(&vnn->public_address)),
+ false,
+ CTDB_EVENT_RELEASE_IP,
+ "%s %s %u",
+ ctdb_vnn_iface_string(vnn),
+ ctdb_addr_to_str(&vnn->public_address),
vnn->public_netmask_bits);
- talloc_free(vnn);
+ if (vnn->iface != NULL) {
+ ctdb_vnn_unassign_iface(ctdb, vnn);
+ }
if (ret != 0) {
return -1;
}
return -1;
}
+/* This function is called from the recovery daemon to verify that a remote
+ node has the expected ip allocation.
+ This is verified against ctdb->ip_tree
+*/
+int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
+{
+ struct ctdb_public_ip_list *tmp_ip;
+ int i;
+
+ if (ctdb->ip_tree == NULL) {
+ /* dont know the expected allocation yet, assume remote node
+ is correct. */
+ return 0;
+ }
+
+ if (ips == NULL) {
+ return 0;
+ }
+
+ for (i=0; i<ips->num; i++) {
+ tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
+ if (tmp_ip == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
+ return -1;
+ }
+
+ if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
+ continue;
+ }
+
+ if (tmp_ip->pnn != ips->ips[i].pnn) {
+ DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
+{
+ struct ctdb_public_ip_list *tmp_ip;
+
+ if (ctdb->ip_tree == NULL) {
+ DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
+ return -1;
+ }
+
+ tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
+ if (tmp_ip == NULL) {
+ DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
+ return -1;
+ }
+
+ DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
+ tmp_ip->pnn = ip->pnn;
+
+ return 0;
+}