TDB_DATA data;
if (status != 0) {
- struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
-
if (status == -ETIME) {
ctdb_ban_self(ctdb);
}
ctdb_vnn_iface_string(state->vnn)));
ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
- node->flags |= NODE_FLAGS_UNHEALTHY;
talloc_free(state);
return;
}
TALLOC_FREE(ctdb->ip_tree);
ctdb->ip_tree = trbt_create(ctdb, 0);
- for (i=0; i < ctdb->num_nodes; i++) {
-
- if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
- continue;
- }
+ if (ipalloc_state->known_public_ips == NULL) {
+ DEBUG(DEBUG_ERR, ("Known public IPs not set\n"));
+ return NULL;
+ }
- /* there were no public ips for this node */
- if (ipalloc_state->known_public_ips == NULL) {
- continue;
- }
+ for (i=0; i < ipalloc_state->num; i++) {
public_ips = &ipalloc_state->known_public_ips[i];
CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
/* Do not use information about IP addresses hosted
* on other nodes, it may not be accurate */
- if (public_ips->ip[j].pnn == ctdb->nodes[i]->pnn) {
+ if (public_ips->ip[j].pnn == i) {
tmp_ip->pnn = public_ips->ip[j].pnn;
} else {
tmp_ip->pnn = -1;
}
struct takeover_callback_data {
- bool *node_failed;
- client_async_callback fail_callback;
- void *fail_callback_data;
- struct ctdb_node_map_old *nodemap;
+ uint32_t num_nodes; /* number of entries in fail_count */
+ unsigned int *fail_count; /* failures recorded per node, indexed by PNN */
};
+/*
+ * Allocate the failure-tracking state shared by the fail callbacks of
+ * a takeover run.
+ *
+ * The structure is allocated zeroed on mem_ctx and its fail_count
+ * array (num_nodes zeroed entries) is allocated on the structure
+ * itself, so freeing the structure (or mem_ctx) releases both.
+ *
+ * Returns the new structure, or NULL (after logging) if either
+ * allocation fails.
+ */
+static struct takeover_callback_data *
+takeover_callback_data_init(TALLOC_CTX *mem_ctx,
+			    uint32_t num_nodes)
+{
+	/* Plain automatic variable: nothing here needs to persist
+	 * between calls, so it must not have static storage. */
+	struct takeover_callback_data *takeover_data;
+
+	takeover_data = talloc_zero(mem_ctx, struct takeover_callback_data);
+	if (takeover_data == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		return NULL;
+	}
+
+	takeover_data->fail_count = talloc_zero_array(takeover_data,
+						      unsigned int, num_nodes);
+	if (takeover_data->fail_count == NULL) {
+		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+		/* Do not hand back a half-initialised structure */
+		talloc_free(takeover_data);
+		return NULL;
+	}
+
+	takeover_data->num_nodes = num_nodes;
+
+	return takeover_data;
+}
+
static void takeover_run_fail_callback(struct ctdb_context *ctdb,
uint32_t node_pnn, int32_t res,
TDB_DATA outdata, void *callback_data)
struct takeover_callback_data *cd =
talloc_get_type_abort(callback_data,
struct takeover_callback_data);
- int i;
- for (i = 0; i < cd->nodemap->num; i++) {
- if (node_pnn == cd->nodemap->nodes[i].pnn) {
- break;
- }
- }
-
- if (i == cd->nodemap->num) {
+ if (node_pnn >= cd->num_nodes) { /* node_pnn is used as an index into fail_count below */
DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
return;
}
- if (!cd->node_failed[i]) {
- cd->node_failed[i] = true;
- cd->fail_callback(ctdb, node_pnn, res, outdata,
- cd->fail_callback_data);
+ if (cd->fail_count[node_pnn] == 0) { /* log only the first failure seen for this node */
+ DEBUG(DEBUG_ERR,
+ ("Node %u failed the takeover run\n", node_pnn));
+ }
+
+ cd->fail_count[node_pnn]++; /* every failure is counted, including the first */
+}
+
+/*
+ * Find the node with the highest fail count recorded in tcd and, if
+ * any node failed at all, broadcast a CTDB_SRVID_BANNING message
+ * carrying that node's PNN.  When several nodes share the highest
+ * count, the lowest PNN among them is chosen.
+ */
+static void takeover_run_process_failures(struct ctdb_context *ctdb,
+					  struct takeover_callback_data *tcd)
+{
+	uint32_t worst_pnn = -1;
+	unsigned int worst_count = 0;
+	uint32_t pnn;
+	TDB_DATA msg;
+
+	for (pnn = 0; pnn < tcd->num_nodes; pnn++) {
+		unsigned int count = tcd->fail_count[pnn];
+
+		/* Strict comparison: the first node to reach a count wins */
+		if (count > worst_count) {
+			worst_count = count;
+			worst_pnn = pnn;
+		}
+	}
+
+	if (worst_count == 0) {
+		/* No failures were recorded, nothing to report */
+		return;
+	}
+
+	DEBUG(DEBUG_ERR,
+	      ("Sending banning credits to %u with fail count %u\n",
+	       worst_pnn, worst_count));
+
+	/* The message payload is just the PNN of the culprit */
+	msg.dsize = sizeof(uint32_t);
+	msg.dptr = (uint8_t *)&worst_pnn;
+
+	if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
+				     CTDB_SRVID_BANNING, msg) != 0) {
+		DEBUG(DEBUG_ERR,
+		      ("Failed to set banning credits for node %u\n",
+		       worst_pnn));
	}
}
* - Send IPREALLOCATED to all nodes (with backward compatibility hack)
*/
int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
- uint32_t *force_rebalance_nodes,
- client_async_callback fail_callback, void *callback_data)
+ uint32_t *force_rebalance_nodes)
{
int i, ret;
struct ctdb_public_ip ip;
struct takeover_callback_data *takeover_data;
bool can_host_ips;
+ /* Initialise fail callback data to be used with
+ * takeover_run_fail_callback(). A failure in any of the
+ * following steps will cause an early return, so this can be
+ * reused for each of those steps without re-initialising. */
+ takeover_data = takeover_callback_data_init(tmp_ctx,
+ nodemap->num);
+ if (takeover_data == NULL) {
+ talloc_free(tmp_ctx);
+ return -1;
+ }
+
/*
* ip failover is completely disabled, just send out the
* ipreallocated event.
* host. This will be a NOOP on nodes that don't currently
* hold the given IP.
*/
- takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
- CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
-
- takeover_data->node_failed = talloc_zero_array(tmp_ctx,
- bool, nodemap->num);
- CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
- takeover_data->fail_callback = fail_callback;
- takeover_data->fail_callback_data = callback_data;
- takeover_data->nodemap = nodemap;
-
async_data = talloc_zero(tmp_ctx, struct client_async_data);
CTDB_NO_MEMORY_FATAL(ctdb, async_data);
}
}
if (ctdb_client_async_wait(ctdb, async_data) != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
- talloc_free(tmp_ctx);
- return -1;
+ DEBUG(DEBUG_ERR,
+ ("Async control CTDB_CONTROL_RELEASE_IP failed\n"));
+ goto fail;
}
talloc_free(async_data);
async_data = talloc_zero(tmp_ctx, struct client_async_data);
CTDB_NO_MEMORY_FATAL(ctdb, async_data);
- async_data->fail_callback = fail_callback;
- async_data->callback_data = callback_data;
+ async_data->fail_callback = takeover_run_fail_callback;
+ async_data->callback_data = takeover_data;
for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
if (tmp_ip->pnn == -1) {
ctdb_client_async_add(async_data, state);
}
if (ctdb_client_async_wait(ctdb, async_data) != 0) {
- DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
- talloc_free(tmp_ctx);
- return -1;
+ DEBUG(DEBUG_ERR,
+ ("Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
+ goto fail;
}
ipreallocated:
ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
nodes, 0, TAKEOVER_TIMEOUT(),
false, tdb_null,
- NULL, fail_callback,
- &callback_data);
+ NULL, takeover_run_fail_callback,
+ takeover_data);
if (ret != 0) {
DEBUG(DEBUG_ERR,
("Async CTDB_CONTROL_IPREALLOCATED control failed\n"));
+ goto fail;
}
talloc_free(tmp_ctx);
return ret;
+
+fail:
+ takeover_run_process_failures(ctdb, takeover_data);
+ talloc_free(tmp_ctx);
+ return -1;
}