If any of the nodes fails a takeover run (either due to a timeout or a failure
to complete within the takeover_timeout interval) from the main loop, the
recovery master gives up on the takeover run with the following message:
"Unable to setup public takeover addresses. Try again later"
As a side effect, monitoring is left disabled on all the nodes. Before
ctdb_takeover_run() is called from the main loop, monitoring gets disabled via
the startrecovery event. Since ctdb_takeover_run() fails, the recovered event
never runs and monitoring does not get re-enabled.
In main_loop, ctdb_takeover_run() is called with a takeover_fail_callback.
This callback will get called if any of the nodes fail in handling
takeip/releaseip/ipreallocated events in ctdb_takeover_run().
Signed-off-by: Amitay Isaacs <amitay@gmail.com>
Cherry-pick-from:
cbe68821180e04988edf186dcf6d042edcab81de
Conflicts:
server/ctdb_recoverd.c
const ctdb_sock_addr *src,
uint32_t seq, uint32_t ack, int rst);
+typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data);
+
int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist);
int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
const char *iface,
int ctdb_set_event_script(struct ctdb_context *ctdb, const char *script);
int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir);
int ctdb_set_notification_script(struct ctdb_context *ctdb, const char *script);
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap);
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, client_async_callback fail_callback, void *callback_data);
int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
TDB_DATA indata);
int ctdb_set_child_logging(struct ctdb_context *ctdb);
void ctdb_lockdown_memory(struct ctdb_context *ctdb);
-typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data);
-
struct client_async_data {
enum ctdb_controls opcode;
bool dont_log_errors;
}
+/*
+ * This callback is called for every node that failed to execute ctdb_takeover_run().
+ * It marks the node as a culprit and sets the flag to re-run the takeover run.
+ */
+static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+ struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+ DEBUG(DEBUG_ERR, (__location__ " Node %u failed the takeover run. Setting it as recovery fail culprit\n", node_pnn));
+
+ ctdb_set_culprit(rec, node_pnn);
+ rec->need_takeover_run = true;
+}
+
+
/*
we are the recmaster, and recovery is needed - start a recovery run
*/
return -1;
}
rec->need_takeover_run = false;
- ret = ctdb_takeover_run(ctdb, nodemap);
+ ret = ctdb_takeover_run(ctdb, nodemap, NULL, NULL);
if (ret != 0) {
DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
rec->need_takeover_run = true;
rec->need_takeover_run = true;
}
if (ret == 0) {
- ret = ctdb_takeover_run(ctdb, rec->nodemap);
+ ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
if (ret != 0) {
DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
rec->need_takeover_run = true;
return;
}
- ret = ctdb_takeover_run(ctdb, nodemap);
+ /* If takeover run fails, then the offending nodes are
+ * assigned ban culprit counts. And we re-try takeover.
+ * If takeover run fails repeatedly, the node would get
+ * banned.
+ *
+ * If rec->need_takeover_run is not set to true at this
+ * failure, monitoring is disabled cluster-wide (via
+ * startrecovery eventscript) and will not get enabled.
+ */
+ ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, rec);
if (ret != 0) {
- DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
+ DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Trying again\n"));
return;
}
/*
make any IP alias changes for public addresses that are necessary
*/
-int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+ client_async_callback fail_callback, void *callback_data)
{
int i;
struct ctdb_public_ip ip;
async_data = talloc_zero(tmp_ctx, struct client_async_data);
CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+ async_data->fail_callback = fail_callback;
+ async_data->callback_data = callback_data;
+
for (i=0;i<nodemap->num;i++) {
/* don't talk to unconnected nodes, but do talk to banned nodes */
if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
/* tell all nodes to get their own IPs */
async_data = talloc_zero(tmp_ctx, struct client_async_data);
CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+
+ async_data->fail_callback = fail_callback;
+ async_data->callback_data = callback_data;
+
for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
if (tmp_ip->pnn == -1) {
/* this IP won't be taken over */
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RUN_EVENTSCRIPTS,
nodes, 0, TAKEOVER_TIMEOUT(),
false, data,
- NULL, NULL,
- NULL) != 0) {
+ NULL, fail_callback,
+ callback_data) != 0) {
DEBUG(DEBUG_ERR, (__location__ " ctdb_control to updatenatgw failed\n"));
}