Found during automatic regression testing.
We do not allow the takeip/releaseip events to be executed during a recovery.
The "ctdb addip", "ctdb delip" and "ctdb moveip" commands all rely on these
events and force them to trigger in order to perform the required ip assignments.
If one of these commands collides with a recovery, it can fail, since we do
not allow takeip/releaseip events to trigger during the recovery.
While it is easy to just try running the command again, this is suboptimal for script use.
Change these commands to retry these operations a few times, until they either succeed or we give up.
This makes the commands much easier to use in scripts.
static int control_moveip(struct ctdb_context *ctdb, int argc, const char **argv)
{
uint32_t pnn;
static int control_moveip(struct ctdb_context *ctdb, int argc, const char **argv)
{
uint32_t pnn;
ctdb_sock_addr addr;
if (argc < 2) {
ctdb_sock_addr addr;
if (argc < 2) {
- if (move_ip(ctdb, &addr, pnn) != 0) {
- DEBUG(DEBUG_ERR,("Failed to move ip to node %d\n", pnn));
+ do {
+ ret = move_ip(ctdb, &addr, pnn);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to move ip to node %d. Wait 3 second and try again.\n", pnn));
+ sleep(3);
+ retries++;
+ }
+ } while (retries < 5 && ret != 0);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to move ip to node %d. Giving up.\n", pnn));
static int control_addip(struct ctdb_context *ctdb, int argc, const char **argv)
{
int i, ret;
static int control_addip(struct ctdb_context *ctdb, int argc, const char **argv)
{
int i, ret;
uint32_t pnn;
unsigned mask;
ctdb_sock_addr addr;
uint32_t pnn;
unsigned mask;
ctdb_sock_addr addr;
pub->len = strlen(argv[1])+1;
memcpy(&pub->iface[0], argv[1], strlen(argv[1])+1);
pub->len = strlen(argv[1])+1;
memcpy(&pub->iface[0], argv[1], strlen(argv[1])+1);
- ret = ctdb_ctrl_add_public_ip(ctdb, TIMELIMIT(), options.pnn, pub);
+ do {
+ ret = ctdb_ctrl_add_public_ip(ctdb, TIMELIMIT(), options.pnn, pub);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("Unable to add public ip to node %u. Wait 3 seconds and try again.\n", options.pnn));
+ sleep(3);
+ retries++;
+ }
+ } while (retries < 5 && ret != 0);
- DEBUG(DEBUG_ERR, ("Unable to add public ip to node %u\n", options.pnn));
+ DEBUG(DEBUG_ERR, ("Unable to add public ip to node %u. Giving up.\n", options.pnn));
talloc_free(tmp_ctx);
return ret;
}
talloc_free(tmp_ctx);
return ret;
}
- if (move_ip(ctdb, &addr, pnn) != 0) {
- DEBUG(DEBUG_ERR,("Failed to move ip to node %d\n", pnn));
- return -1;
+ do {
+ ret = move_ip(ctdb, &addr, pnn);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to move ip to node %d. wait 3 seconds and try again.\n", pnn));
+ sleep(3);
+ retries++;
+ }
+ } while (retries < 5 && ret != 0);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to move ip to node %d. Giving up.\n", pnn));
+ talloc_free(tmp_ctx);
+ return ret;
- ret = control_ipreallocate(ctdb, argc, argv);
+ do {
+ ret = control_ipreallocate(ctdb, argc, argv);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR, ("IP Reallocate failed on node %u. Wait 3 seconds and try again.\n", options.pnn));
+ sleep(3);
+ retries++;
+ }
+ } while (retries < 5 && ret != 0);
- DEBUG(DEBUG_ERR, ("IP Reallocate failed on node %u\n", options.pnn));
+ DEBUG(DEBUG_ERR, ("IP Reallocate failed on node %u. Giving up.\n", options.pnn));
+ talloc_free(tmp_ctx);
static int control_delip(struct ctdb_context *ctdb, int argc, const char **argv)
{
int i, ret;
static int control_delip(struct ctdb_context *ctdb, int argc, const char **argv)
{
int i, ret;
ctdb_sock_addr addr;
struct ctdb_control_ip_iface pub;
TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
ctdb_sock_addr addr;
struct ctdb_control_ip_iface pub;
TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
if (ips->ips[i].pnn == options.pnn) {
ret = find_other_host_for_public_ip(ctdb, &addr);
if (ret != -1) {
if (ips->ips[i].pnn == options.pnn) {
ret = find_other_host_for_public_ip(ctdb, &addr);
if (ret != -1) {
- if (move_ip(ctdb, &addr, ret) != 0) {
- DEBUG(DEBUG_ERR,("Failed to move ip to node %d\n", ret));
+ do {
+ ret = move_ip(ctdb, &addr, ret);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to move ip to node %d. Wait 3 seconds and try again.\n", options.pnn));
+ sleep(3);
+ retries++;
+ }
+ } while (retries < 5 && ret != 0);
+ if (ret != 0) {
+ DEBUG(DEBUG_ERR,("Failed to move ip to node %d. Giving up.\n", options.pnn));