From 4ff998bf33cfa6d1222f7c8f8eba3aeec0f36e6e Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 3 Dec 2010 13:28:35 +1100 Subject: [PATCH] During IP allocation, there are failure modes where a node might hold an IP address but think it is still unassigned (-1). Add code to the recovery daemon to detect this case and trigger a reallocation so that the IP gets covered, and change the takeip code to allow for this condition, taking on an IP address that is already hosted. cq s1021073 --- server/ctdb_recoverd.c | 14 ++++++++++---- server/ctdb_takeover.c | 9 +++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c index c6bf6580..9caa5024 100644 --- a/server/ctdb_recoverd.c +++ b/server/ctdb_recoverd.c @@ -2495,7 +2495,7 @@ static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ct /* called to check that the local allocation of public ip addresses is ok. */ -static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn) +static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap) { TALLOC_CTX *mem_ctx = talloc_new(NULL); struct ctdb_control_get_ifaces *ifaces = NULL; @@ -2586,11 +2586,17 @@ static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_rec and we dont have ones we shouldnt have. if we find an inconsistency we set recmode to active on the local node and wait for the recmaster - to do a full blown recovery + to do a full blown recovery. + also if the pnn is -1 and we are healthy and can host the ip + we also request a ip reallocation. 
*/ if (ctdb->tunable.disable_ip_failover == 0) { for (j=0; j<ips->num; j++) { - if (ips->ips[j].pnn == pnn) { + if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) { + DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n", + ctdb_addr_to_str(&ips->ips[j].addr))); + need_takeover_run = true; + } else if (ips->ips[j].pnn == pnn) { if (!ctdb_sys_have_ip(&ips->ips[j].addr)) { DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n", ctdb_addr_to_str(&ips->ips[j].addr))); @@ -3122,7 +3128,7 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, */ if (ctdb->tunable.disable_ip_failover == 0) { if (rec->ip_check_disable_ctx == NULL) { - if (verify_local_ip_allocation(ctdb, rec, pnn) != 0) { + if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) { DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n")); } } diff --git a/server/ctdb_takeover.c b/server/ctdb_takeover.c index d4958079..682d17ba 100644 --- a/server/ctdb_takeover.c +++ b/server/ctdb_takeover.c @@ -611,6 +611,15 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, return -1; } + if (vnn->pnn == -1 && have_ip) { + vnn->pnn = ctdb->pnn; + DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, " + "and we already have it on iface[%s], update local daemon\n", + ctdb_addr_to_str(&vnn->public_address), + ctdb_vnn_iface_string(vnn))); + return 0; + } + if (vnn->iface) { if (vnn->iface->link_up) { /* only move when the rebalance gains something */ -- 2.34.1