Copyright (C) Ronnie Sahlberg 2007

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
30 #include "dlinklist.h"
33 /* most recent reload all ips request we need to perform during the
36 struct reloadips_all_reply *reload_all_ips_request = NULL;
38 /* list of "ctdb ipreallocate" processes to call back when we have
39 finished the takeover run.
41 struct ip_reallocate_list {
42 struct ip_reallocate_list *next;
43 struct rd_memdump_reply *rd;
46 struct ctdb_banning_state {
48 struct timeval last_reported_time;
52 private state of recovery daemon
54 struct ctdb_recoverd {
55 struct ctdb_context *ctdb;
58 uint32_t num_connected;
59 uint32_t last_culprit_node;
60 struct ctdb_node_map *nodemap;
61 struct timeval priority_time;
62 bool need_takeover_run;
65 struct timed_event *send_election_te;
66 struct timed_event *election_timeout;
67 struct vacuum_info *vacuum_info;
68 TALLOC_CTX *ip_reallocate_ctx;
69 struct ip_reallocate_list *reallocate_callers;
70 TALLOC_CTX *ip_check_disable_ctx;
71 struct ctdb_control_get_ifaces *ifaces;
72 TALLOC_CTX *deferred_rebalance_ctx;
75 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
76 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
78 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
81 ban a node for a period of time
83 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
86 struct ctdb_context *ctdb = rec->ctdb;
87 struct ctdb_ban_time bantime;
89 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
91 if (!ctdb_validate_pnn(ctdb, pnn)) {
92 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
97 bantime.time = ban_time;
99 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
101 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
107 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
111 remember the trouble maker
113 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
115 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
116 struct ctdb_banning_state *ban_state;
118 if (culprit > ctdb->num_nodes) {
119 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
123 if (ctdb->nodes[culprit]->ban_state == NULL) {
124 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
125 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
129 ban_state = ctdb->nodes[culprit]->ban_state;
130 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
131 /* this was the first time in a long while this node
132 misbehaved so we will forgive any old transgressions.
134 ban_state->count = 0;
137 ban_state->count += count;
138 ban_state->last_reported_time = timeval_current();
139 rec->last_culprit_node = culprit;
143 remember the trouble maker
145 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
147 ctdb_set_culprit_count(rec, culprit, 1);
151 /* this callback is called for every node that failed to execute the
154 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
156 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
158 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
160 ctdb_set_culprit(rec, node_pnn);
164 run the "recovered" eventscript on all nodes
166 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
170 struct ctdb_context *ctdb = rec->ctdb;
172 tmp_ctx = talloc_new(ctdb);
173 CTDB_NO_MEMORY(ctdb, tmp_ctx);
175 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
176 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
178 CONTROL_TIMEOUT(), false, tdb_null,
179 NULL, recovered_fail_callback,
181 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
183 talloc_free(tmp_ctx);
187 talloc_free(tmp_ctx);
191 /* this callback is called for every node that failed to execute the
194 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
196 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
198 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
200 ctdb_set_culprit(rec, node_pnn);
204 run the "startrecovery" eventscript on all nodes
206 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
210 struct ctdb_context *ctdb = rec->ctdb;
212 tmp_ctx = talloc_new(ctdb);
213 CTDB_NO_MEMORY(ctdb, tmp_ctx);
215 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
216 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
218 CONTROL_TIMEOUT(), false, tdb_null,
220 startrecovery_fail_callback,
222 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
223 talloc_free(tmp_ctx);
227 talloc_free(tmp_ctx);
231 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
233 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
234 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
237 if (node_pnn < ctdb->num_nodes) {
238 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
241 if (node_pnn == ctdb->pnn) {
242 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
247 update the node capabilities for all connected nodes
249 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
254 tmp_ctx = talloc_new(ctdb);
255 CTDB_NO_MEMORY(ctdb, tmp_ctx);
257 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
258 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
262 async_getcap_callback, NULL,
264 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
265 talloc_free(tmp_ctx);
269 talloc_free(tmp_ctx);
273 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
275 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
277 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
278 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
281 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
283 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
285 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
286 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
290 change recovery mode on all nodes
292 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
298 tmp_ctx = talloc_new(ctdb);
299 CTDB_NO_MEMORY(ctdb, tmp_ctx);
301 /* freeze all nodes */
302 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
303 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
306 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
307 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
312 set_recmode_fail_callback,
314 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
315 talloc_free(tmp_ctx);
322 data.dsize = sizeof(uint32_t);
323 data.dptr = (unsigned char *)&rec_mode;
325 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
331 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
332 talloc_free(tmp_ctx);
336 talloc_free(tmp_ctx);
341 change recovery master on all node
343 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
349 tmp_ctx = talloc_new(ctdb);
350 CTDB_NO_MEMORY(ctdb, tmp_ctx);
352 data.dsize = sizeof(uint32_t);
353 data.dptr = (unsigned char *)&pnn;
355 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
356 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
358 CONTROL_TIMEOUT(), false, data,
361 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
362 talloc_free(tmp_ctx);
366 talloc_free(tmp_ctx);
370 /* update all remote nodes to use the same db priority that we have
371 this can fail if the remove node has not yet been upgraded to
372 support this function, so we always return success and never fail
373 a recovery if this call fails.
375 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
376 struct ctdb_node_map *nodemap,
377 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
382 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
384 /* step through all local databases */
385 for (db=0; db<dbmap->num;db++) {
387 struct ctdb_db_priority db_prio;
390 db_prio.db_id = dbmap->dbs[db].dbid;
391 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
393 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
397 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
399 data.dptr = (uint8_t *)&db_prio;
400 data.dsize = sizeof(db_prio);
402 if (ctdb_client_async_control(ctdb,
403 CTDB_CONTROL_SET_DB_PRIORITY,
405 CONTROL_TIMEOUT(), false, data,
408 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
416 ensure all other nodes have attached to any databases that we have
418 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
419 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
422 struct ctdb_dbid_map *remote_dbmap;
424 /* verify that all other nodes have all our databases */
425 for (j=0; j<nodemap->num; j++) {
426 /* we dont need to ourself ourselves */
427 if (nodemap->nodes[j].pnn == pnn) {
430 /* dont check nodes that are unavailable */
431 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
435 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
436 mem_ctx, &remote_dbmap);
438 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
442 /* step through all local databases */
443 for (db=0; db<dbmap->num;db++) {
447 for (i=0;i<remote_dbmap->num;i++) {
448 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
452 /* the remote node already have this database */
453 if (i!=remote_dbmap->num) {
456 /* ok so we need to create this database */
457 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
460 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
463 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
465 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
467 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
478 ensure we are attached to any databases that anyone else is attached to
480 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
481 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
484 struct ctdb_dbid_map *remote_dbmap;
486 /* verify that we have all database any other node has */
487 for (j=0; j<nodemap->num; j++) {
488 /* we dont need to ourself ourselves */
489 if (nodemap->nodes[j].pnn == pnn) {
492 /* dont check nodes that are unavailable */
493 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
497 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
498 mem_ctx, &remote_dbmap);
500 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
504 /* step through all databases on the remote node */
505 for (db=0; db<remote_dbmap->num;db++) {
508 for (i=0;i<(*dbmap)->num;i++) {
509 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
513 /* we already have this db locally */
514 if (i!=(*dbmap)->num) {
517 /* ok so we need to create this database and
520 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
521 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
523 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
524 nodemap->nodes[j].pnn));
527 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
528 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
530 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
533 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
535 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
546 pull the remote database contents from one node into the recdb
548 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
549 struct tdb_wrap *recdb, uint32_t dbid)
553 struct ctdb_marshall_buffer *reply;
554 struct ctdb_rec_data *rec;
556 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
558 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
559 CONTROL_TIMEOUT(), &outdata);
561 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
562 talloc_free(tmp_ctx);
566 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
568 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
569 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
570 talloc_free(tmp_ctx);
574 rec = (struct ctdb_rec_data *)&reply->data[0];
578 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
580 struct ctdb_ltdb_header *hdr;
583 key.dptr = &rec->data[0];
584 key.dsize = rec->keylen;
585 data.dptr = &rec->data[key.dsize];
586 data.dsize = rec->datalen;
588 hdr = (struct ctdb_ltdb_header *)data.dptr;
590 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
591 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
592 talloc_free(tmp_ctx);
596 /* fetch the existing record, if any */
597 existing = tdb_fetch(recdb->tdb, key);
599 if (existing.dptr != NULL) {
600 struct ctdb_ltdb_header header;
601 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
602 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
603 (unsigned)existing.dsize, srcnode));
605 talloc_free(tmp_ctx);
608 header = *(struct ctdb_ltdb_header *)existing.dptr;
610 if (!(header.rsn < hdr->rsn ||
611 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
616 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
617 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
618 talloc_free(tmp_ctx);
623 talloc_free(tmp_ctx);
629 struct pull_seqnum_cbdata {
635 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
637 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
640 if (cb_data->failed != 0) {
641 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
646 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
651 if (outdata.dsize != sizeof(uint64_t)) {
652 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
653 cb_data->failed = -1;
657 seqnum = *((uint64_t *)outdata.dptr);
659 if (seqnum > cb_data->seqnum) {
660 cb_data->seqnum = seqnum;
661 cb_data->pnn = node_pnn;
665 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
667 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
669 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
673 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
674 struct ctdb_recoverd *rec,
675 struct ctdb_node_map *nodemap,
676 struct tdb_wrap *recdb, uint32_t dbid)
678 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
682 struct pull_seqnum_cbdata *cb_data;
684 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
689 data.dsize = sizeof(outdata);
690 data.dptr = (uint8_t *)&outdata[0];
692 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
693 if (cb_data == NULL) {
694 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
695 talloc_free(tmp_ctx);
703 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
704 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
706 CONTROL_TIMEOUT(), false, data,
710 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
712 talloc_free(tmp_ctx);
716 if (cb_data->failed != 0) {
717 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
718 talloc_free(tmp_ctx);
722 if (cb_data->seqnum == 0 || cb_data->pnn == -1) {
723 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
724 talloc_free(tmp_ctx);
728 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
730 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
731 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
732 talloc_free(tmp_ctx);
736 talloc_free(tmp_ctx);
742 pull all the remote database contents into the recdb
744 static int pull_remote_database(struct ctdb_context *ctdb,
745 struct ctdb_recoverd *rec,
746 struct ctdb_node_map *nodemap,
747 struct tdb_wrap *recdb, uint32_t dbid,
752 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
754 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
760 /* pull all records from all other nodes across onto this node
761 (this merges based on rsn)
763 for (j=0; j<nodemap->num; j++) {
764 /* dont merge from nodes that are unavailable */
765 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
768 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
769 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
770 nodemap->nodes[j].pnn));
771 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
781 update flags on all active nodes
783 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
787 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
789 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
797 ensure all nodes have the same vnnmap we do
799 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
800 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
804 /* push the new vnn map out to all the nodes */
805 for (j=0; j<nodemap->num; j++) {
806 /* dont push to nodes that are unavailable */
807 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
811 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
813 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
823 struct vacuum_info *next, *prev;
824 struct ctdb_recoverd *rec;
826 struct ctdb_db_context *ctdb_db;
827 struct ctdb_marshall_buffer *recs;
828 struct ctdb_rec_data *r;
831 static void vacuum_fetch_next(struct vacuum_info *v);
834 called when a vacuum fetch has completed - just free it and do the next one
836 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
838 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
840 vacuum_fetch_next(v);
845 process the next element from the vacuum list
847 static void vacuum_fetch_next(struct vacuum_info *v)
849 struct ctdb_call call;
850 struct ctdb_rec_data *r;
852 while (v->recs->count) {
853 struct ctdb_client_call_state *state;
855 struct ctdb_ltdb_header *hdr;
858 call.call_id = CTDB_NULL_FUNC;
859 call.flags = CTDB_IMMEDIATE_MIGRATION;
860 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
863 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
866 call.key.dptr = &r->data[0];
867 call.key.dsize = r->keylen;
869 /* ensure we don't block this daemon - just skip a record if we can't get
871 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
875 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
876 if (data.dptr == NULL) {
877 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
881 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
883 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
887 hdr = (struct ctdb_ltdb_header *)data.dptr;
888 if (hdr->dmaster == v->rec->ctdb->pnn) {
889 /* its already local */
891 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
897 state = ctdb_call_send(v->ctdb_db, &call);
898 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
900 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
904 state->async.fn = vacuum_fetch_callback;
905 state->async.private_data = v;
914 destroy a vacuum info structure
916 static int vacuum_info_destructor(struct vacuum_info *v)
918 DLIST_REMOVE(v->rec->vacuum_info, v);
924 handler for vacuum fetch
926 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
927 TDB_DATA data, void *private_data)
929 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
930 struct ctdb_marshall_buffer *recs;
932 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
934 struct ctdb_dbid_map *dbmap=NULL;
935 bool persistent = false;
936 struct ctdb_db_context *ctdb_db;
937 struct ctdb_rec_data *r;
939 struct vacuum_info *v;
941 recs = (struct ctdb_marshall_buffer *)data.dptr;
942 r = (struct ctdb_rec_data *)&recs->data[0];
944 if (recs->count == 0) {
945 talloc_free(tmp_ctx);
951 for (v=rec->vacuum_info;v;v=v->next) {
952 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
953 /* we're already working on records from this node */
954 talloc_free(tmp_ctx);
959 /* work out if the database is persistent */
960 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
962 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
963 talloc_free(tmp_ctx);
967 for (i=0;i<dbmap->num;i++) {
968 if (dbmap->dbs[i].dbid == recs->db_id) {
969 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
973 if (i == dbmap->num) {
974 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
975 talloc_free(tmp_ctx);
979 /* find the name of this database */
980 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
981 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
982 talloc_free(tmp_ctx);
987 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
988 if (ctdb_db == NULL) {
989 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
990 talloc_free(tmp_ctx);
994 v = talloc_zero(rec, struct vacuum_info);
996 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
997 talloc_free(tmp_ctx);
1002 v->srcnode = srcnode;
1003 v->ctdb_db = ctdb_db;
1004 v->recs = talloc_memdup(v, recs, data.dsize);
1005 if (v->recs == NULL) {
1006 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1008 talloc_free(tmp_ctx);
1011 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1013 DLIST_ADD(rec->vacuum_info, v);
1015 talloc_set_destructor(v, vacuum_info_destructor);
1017 vacuum_fetch_next(v);
1018 talloc_free(tmp_ctx);
1023 called when ctdb_wait_timeout should finish
1025 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1026 struct timeval yt, void *p)
1028 uint32_t *timed_out = (uint32_t *)p;
1033 wait for a given number of seconds
1035 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1037 uint32_t timed_out = 0;
1038 time_t usecs = (secs - (time_t)secs) * 1000000;
1039 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1040 while (!timed_out) {
1041 event_loop_once(ctdb->ev);
1046 called when an election times out (ends)
1048 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1049 struct timeval t, void *p)
1051 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1052 rec->election_timeout = NULL;
1055 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
1060 wait for an election to finish. It finished election_timeout seconds after
1061 the last election packet is received
1063 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1065 struct ctdb_context *ctdb = rec->ctdb;
1066 while (rec->election_timeout) {
1067 event_loop_once(ctdb->ev);
1072 Update our local flags from all remote connected nodes.
1073 This is only run when we are or we belive we are the recovery master
1075 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1078 struct ctdb_context *ctdb = rec->ctdb;
1079 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1081 /* get the nodemap for all active remote nodes and verify
1082 they are the same as for this node
1084 for (j=0; j<nodemap->num; j++) {
1085 struct ctdb_node_map *remote_nodemap=NULL;
1088 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1091 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1095 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1096 mem_ctx, &remote_nodemap);
1098 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1099 nodemap->nodes[j].pnn));
1100 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1101 talloc_free(mem_ctx);
1102 return MONITOR_FAILED;
1104 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1105 /* We should tell our daemon about this so it
1106 updates its flags or else we will log the same
1107 message again in the next iteration of recovery.
1108 Since we are the recovery master we can just as
1109 well update the flags on all nodes.
1111 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
1113 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1117 /* Update our local copy of the flags in the recovery
1120 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1121 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1122 nodemap->nodes[j].flags));
1123 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1125 talloc_free(remote_nodemap);
1127 talloc_free(mem_ctx);
1132 /* Create a new random generation ip.
1133 The generation id can not be the INVALID_GENERATION id
1135 static uint32_t new_generation(void)
1137 uint32_t generation;
1140 generation = random();
1142 if (generation != INVALID_GENERATION) {
1152 create a temporary working database
1154 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1157 struct tdb_wrap *recdb;
1160 /* open up the temporary recovery database */
1161 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1162 ctdb->db_directory_state,
1169 tdb_flags = TDB_NOLOCK;
1170 if (ctdb->valgrinding) {
1171 tdb_flags |= TDB_NOMMAP;
1173 tdb_flags |= TDB_DISALLOW_NESTING;
1175 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1176 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1177 if (recdb == NULL) {
1178 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1188 a traverse function for pulling all relevant records from recdb
1191 struct ctdb_context *ctdb;
1192 struct ctdb_marshall_buffer *recdata;
1194 uint32_t allocated_len;
1199 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1201 struct recdb_data *params = (struct recdb_data *)p;
1202 struct ctdb_rec_data *rec;
1203 struct ctdb_ltdb_header *hdr;
1206 * skip empty records - but NOT for persistent databases:
1208 * The record-by-record mode of recovery deletes empty records.
1209 * For persistent databases, this can lead to data corruption
1210 * by deleting records that should be there:
1212 * - Assume the cluster has been running for a while.
1214 * - A record R in a persistent database has been created and
1215 * deleted a couple of times, the last operation being deletion,
1216 * leaving an empty record with a high RSN, say 10.
1218 * - Now a node N is turned off.
1220 * - This leaves the local database copy of D on N with the empty
1221 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1222 * the copy of record R.
1224 * - Now the record is created again while node N is turned off.
1225 * This creates R with RSN = 1 on all nodes except for N.
1227 * - Now node N is turned on again. The following recovery will chose
1228 * the older empty copy of R due to RSN 10 > RSN 1.
1230 * ==> Hence the record is gone after the recovery.
1232 * On databases like Samba's registry, this can damage the higher-level
1233 * data structures built from the various tdb-level records.
1235 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1239 /* update the dmaster field to point to us */
1240 hdr = (struct ctdb_ltdb_header *)data.dptr;
1241 if (!params->persistent) {
1242 hdr->dmaster = params->ctdb->pnn;
1243 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1246 /* add the record to the blob ready to send to the nodes */
1247 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1249 params->failed = true;
1252 if (params->len + rec->length >= params->allocated_len) {
1253 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1254 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1256 if (params->recdata == NULL) {
1257 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1258 rec->length + params->len, params->recdata->count));
1259 params->failed = true;
1262 params->recdata->count++;
1263 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1264 params->len += rec->length;
1271 push the recdb database out to all nodes
/* Push the locally-assembled recovery database (recdb) to all active
 * nodes via CTDB_CONTROL_PUSH_DB.  The records are marshalled into a
 * single ctdb_marshall_buffer by the traverse_recdb callback.
 * Returns 0 on success; error paths free the marshall buffer and the
 * temporary talloc context.
 * NOTE(review): this excerpt is elided (original line numbers jump);
 * the declarations of `persistent`, `outdata` and `nodes` and the
 * return statements are not visible here.
 */
1273 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1275 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1277 struct recdb_data params;
1278 struct ctdb_marshall_buffer *recdata;
1280 TALLOC_CTX *tmp_ctx;
1283 tmp_ctx = talloc_new(ctdb);
1284 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1286 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1287 CTDB_NO_MEMORY(ctdb, recdata);
1289 recdata->db_id = dbid;
/* Seed the traverse state: the buffer starts with just the marshall
 * header; traverse_recdb grows it and advances params.len per record. */
1292 params.recdata = recdata;
1293 params.len = offsetof(struct ctdb_marshall_buffer, data);
1294 params.allocated_len = params.len;
1295 params.failed = false;
1296 params.persistent = persistent;
/* BUGFIX(review): the address-of operator had been mangled into the
 * mojibake sequence "¶ms" (a collapsed "&para;" HTML entity);
 * restored to &params so the traverse callback gets its state. */
1298 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1299 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1300 talloc_free(params.recdata);
1301 talloc_free(tmp_ctx);
1305 if (params.failed) {
1306 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1307 talloc_free(params.recdata);
1308 talloc_free(tmp_ctx);
/* traverse_recdb may have realloc'd the buffer; pick up the final pointer. */
1312 recdata = params.recdata;
1314 outdata.dptr = (void *)recdata;
1315 outdata.dsize = params.len;
1317 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1318 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1320 CONTROL_TIMEOUT(), false, outdata,
1323 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1324 talloc_free(recdata);
1325 talloc_free(tmp_ctx);
1329 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1330 dbid, recdata->count));
1332 talloc_free(recdata);
1333 talloc_free(tmp_ctx);
1340 go through a full recovery on one database
/* Recover a single database inside the cluster-wide recovery
 * transaction: build a merged local copy (recdb), wipe the database on
 * all active nodes, then push the merged copy back out.
 * NOTE(review): excerpt is elided; `dbid`/`persistent` parameters,
 * `ret`/`data`/`nodes` declarations and return statements are not
 * visible here.
 */
1342 static int recover_database(struct ctdb_recoverd *rec,
1343 TALLOC_CTX *mem_ctx,
1347 struct ctdb_node_map *nodemap,
1348 uint32_t transaction_id)
1350 struct tdb_wrap *recdb;
1352 struct ctdb_context *ctdb = rec->ctdb;
1354 struct ctdb_control_wipe_database w;
/* Scratch tdb that will hold the merged (most-recent-RSN) records. */
1357 recdb = create_recdb(ctdb, mem_ctx);
1358 if (recdb == NULL) {
1362 /* pull all remote databases onto the recdb */
1363 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1365 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1369 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1371 /* wipe all the remote databases. This is safe as we are in a transaction */
1373 w.transaction_id = transaction_id;
1375 data.dptr = (void *)&w;
1376 data.dsize = sizeof(w);
1378 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1379 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1381 CONTROL_TIMEOUT(), false, data,
1384 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1389 /* push out the correct database. This sets the dmaster and skips
1390 the empty records */
1391 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1397 /* all done with this database */
1404 reload the nodes file
/* Thin wrapper: re-read the on-disk nodes file into the ctdb context. */
1406 static void reload_nodes_file(struct ctdb_context *ctdb)
1409 ctdb_load_nodes_file(ctdb);
/* Refresh each node's cached public-IP lists (known and available) by
 * querying every non-inactive node.  On failure *culprit is set to the
 * PNN of the node that caused the failure so the caller can blame it.
 * If IP checking is enabled and not temporarily disabled, inconsistent
 * allocations flag a deferred takeover run (rec->need_takeover_run).
 * NOTE(review): excerpt is elided; `culprit` parameter, `j`/`ret`
 * declarations, timeout arguments and return statements are not
 * visible here.
 */
1412 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1413 struct ctdb_recoverd *rec,
1414 struct ctdb_node_map *nodemap,
/* Sanity check: the daemon's node list and the nodemap must agree. */
1420 if (ctdb->num_nodes != nodemap->num) {
1421 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1422 ctdb->num_nodes, nodemap->num));
1424 *culprit = ctdb->pnn;
1429 for (j=0; j<nodemap->num; j++) {
1430 /* release any existing data */
1431 if (ctdb->nodes[j]->known_public_ips) {
1432 talloc_free(ctdb->nodes[j]->known_public_ips);
1433 ctdb->nodes[j]->known_public_ips = NULL;
1435 if (ctdb->nodes[j]->available_public_ips) {
1436 talloc_free(ctdb->nodes[j]->available_public_ips);
1437 ctdb->nodes[j]->available_public_ips = NULL;
/* Inactive nodes are skipped entirely - their lists stay NULL. */
1440 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1444 /* grab a new shiny list of public ips from the node */
1445 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1447 ctdb->nodes[j]->pnn,
1450 &ctdb->nodes[j]->known_public_ips);
1452 DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1453 ctdb->nodes[j]->pnn));
1455 *culprit = ctdb->nodes[j]->pnn;
/* Only verify allocations when checking is on and not disabled via
 * the ip_check_disable_ctx timer. */
1460 if (ctdb->do_checkpublicip) {
1461 if (rec->ip_check_disable_ctx == NULL) {
1462 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1463 DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1464 rec->need_takeover_run = true;
1469 /* grab a new shiny list of public ips from the node */
1470 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1472 ctdb->nodes[j]->pnn,
1474 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1475 &ctdb->nodes[j]->available_public_ips);
1477 DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1478 ctdb->nodes[j]->pnn));
1480 *culprit = ctdb->nodes[j]->pnn;
1489 /* when we start a recovery, make sure all nodes use the same reclock file
/* Broadcast this node's recovery_lock_file path to all active nodes via
 * CTDB_CONTROL_SET_RECLOCK_FILE so the whole cluster agrees on one
 * reclock file.  NOTE(review): excerpt is elided; the early-return
 * branch for a NULL reclock file and the return statements are not
 * visible here.
 */
1492 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1494 struct ctdb_context *ctdb = rec->ctdb;
1495 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1499 if (ctdb->recovery_lock_file == NULL) {
/* Send the path including its NUL terminator. */
1503 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1504 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1507 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1508 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1514 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1515 talloc_free(tmp_ctx);
1519 talloc_free(tmp_ctx);
1525 * this callback is called for every node that failed to execute ctdb_takeover_run()
1526 * and set flag to re-run takeover run.
1528 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1530 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1532 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the takeover run. Setting it as recovery fail culprit\n", node_pnn));
/* Blame the failing node (feeds the banning counters) and schedule a
 * retry of the takeover run on the next monitor pass. */
1534 ctdb_set_culprit(rec, node_pnn);
1535 rec->need_takeover_run = true;
1540 we are the recmaster, and recovery is needed - start a recovery run
/* Full cluster recovery, driven by the recovery master:
 *   ban repeat-offender nodes, take the recovery lock, create missing
 *   databases, freeze the cluster (CTDB_RECOVERY_ACTIVE), run the
 *   startrecovery event, recover every database under a cluster-wide
 *   transaction, rebuild and distribute the vnnmap, re-assert
 *   recmaster, thaw (CTDB_RECOVERY_NORMAL), re-run IP takeover, run
 *   the recovered event, then suppress further recoveries for
 *   rerecovery_timeout seconds.
 * NOTE(review): excerpt is elided; `i`/`j`/`ret`/`data`/`nodes`
 * declarations, several closing braces and the return statements are
 * not visible here.
 * Only code change in this edit: fixed the log typo "supressed" ->
 * "suppressed"; all other statements are byte-identical.
 */
1542 static int do_recovery(struct ctdb_recoverd *rec,
1543 TALLOC_CTX *mem_ctx, uint32_t pnn,
1544 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1546 struct ctdb_context *ctdb = rec->ctdb;
1548 uint32_t generation;
1549 struct ctdb_dbid_map *dbmap;
1552 struct timeval start_time;
1553 uint32_t culprit = (uint32_t)-1;
1555 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1557 /* if recovery fails, force it again */
1558 rec->need_recovery = true;
/* Ban any node that has caused at least 2*num_nodes recent recoveries. */
1560 for (i=0; i<ctdb->num_nodes; i++) {
1561 struct ctdb_banning_state *ban_state;
1563 if (ctdb->nodes[i]->ban_state == NULL) {
1566 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1567 if (ban_state->count < 2*ctdb->num_nodes) {
1570 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1571 ctdb->nodes[i]->pnn, ban_state->count,
1572 ctdb->tunable.recovery_ban_period));
1573 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1574 ban_state->count = 0;
/* Take the cluster-wide recovery lock; failure bans ourselves. */
1578 if (ctdb->tunable.verify_recovery_lock != 0) {
1579 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1580 start_time = timeval_current();
1581 if (!ctdb_recovery_lock(ctdb, true)) {
1582 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1583 "and ban ourself for %u seconds\n",
1584 ctdb->tunable.recovery_ban_period));
1585 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1588 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1589 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1592 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1594 /* get a list of all databases */
1595 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1597 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1601 /* we do the db creation before we set the recovery mode, so the freeze happens
1602 on all databases we will be dealing with. */
1604 /* verify that we have all the databases any other node has */
1605 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1607 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1611 /* verify that all other nodes have all our databases */
1612 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1614 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1617 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1619 /* update the database priority for all remote databases */
1620 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1622 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1624 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1627 /* update all other nodes to use the same setting for reclock files
1628 as the local recovery master.
1630 sync_recovery_lock_file_across_cluster(rec);
1632 /* set recovery mode to active on all nodes */
1633 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1635 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1639 /* execute the "startrecovery" event script on all nodes */
1640 ret = run_startrecovery_eventscript(rec, nodemap);
1642 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1647 update all nodes to have the same flags that we have
1649 for (i=0;i<nodemap->num;i++) {
1650 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1654 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1656 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1661 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1663 /* pick a new generation number */
1664 generation = new_generation();
1666 /* change the vnnmap on this node to use the new generation
1667 number but not on any other nodes.
1668 this guarantees that if we abort the recovery prematurely
1669 for some reason (a node stops responding?)
1670 that we can just return immediately and we will reenter
1671 recovery shortly again.
1672 I.e. we deliberately leave the cluster with an inconsistent
1673 generation id to allow us to abort recovery at any stage and
1674 just restart it from scratch.
1676 vnnmap->generation = generation;
1677 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1679 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* Start a transaction on all active nodes, cancelling on failure. */
1683 data.dptr = (void *)&generation;
1684 data.dsize = sizeof(uint32_t);
1686 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1687 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1689 CONTROL_TIMEOUT(), false, data,
1691 transaction_start_fail_callback,
1693 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1694 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1696 CONTROL_TIMEOUT(), false, tdb_null,
1700 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1705 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1707 for (i=0;i<dbmap->num;i++) {
1708 ret = recover_database(rec, mem_ctx,
1710 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1711 pnn, nodemap, generation);
1713 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1718 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1720 /* commit all the changes */
1721 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1723 CONTROL_TIMEOUT(), false, data,
1726 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1730 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1733 /* update the capabilities for all nodes */
1734 ret = update_capabilities(ctdb, nodemap);
1736 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1740 /* build a new vnn map with all the currently active and
1742 generation = new_generation();
1743 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1744 CTDB_NO_MEMORY(ctdb, vnnmap);
1745 vnnmap->generation = generation;
1747 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1748 CTDB_NO_MEMORY(ctdb, vnnmap->map);
/* Only active nodes with the LMASTER capability join the new vnnmap. */
1749 for (i=j=0;i<nodemap->num;i++) {
1750 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1753 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1754 /* this node can not be an lmaster */
1755 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1760 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1761 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1762 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* Degenerate case: no lmaster-capable node - use ourselves. */
1765 if (vnnmap->size == 0) {
1766 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1768 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1769 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1770 vnnmap->map[0] = pnn;
1773 /* update to the new vnnmap on all nodes */
1774 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1776 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1780 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1782 /* update recmaster to point to us for all nodes */
1783 ret = set_recovery_master(ctdb, nodemap, pnn);
1785 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1789 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1792 update all nodes to have the same flags that we have
1794 for (i=0;i<nodemap->num;i++) {
1795 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1799 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1801 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1806 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1808 /* disable recovery mode */
1809 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1811 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1815 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1818 tell nodes to takeover their public IPs
1820 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1822 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1824 rec->need_takeover_run = true;
1827 rec->need_takeover_run = false;
1828 ret = ctdb_takeover_run(ctdb, nodemap, NULL, NULL);
1830 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1831 rec->need_takeover_run = true;
1834 /* execute the "recovered" event script on all nodes */
1835 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
1837 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1841 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1843 /* send a message to all clients telling them that the cluster
1844 has been reconfigured */
1845 ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1847 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1849 rec->need_recovery = false;
1851 /* we managed to complete a full recovery, make sure to forgive
1852 any past sins by the nodes that could now participate in the
1855 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1856 for (i=0;i<nodemap->num;i++) {
1857 struct ctdb_banning_state *ban_state;
1859 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1863 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1864 if (ban_state == NULL) {
1868 ban_state->count = 0;
1872 /* We just finished a recovery successfully.
1873 We now wait for rerecovery_timeout before we allow
1874 another recovery to take place.
1876 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1877 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1878 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
1885 elections are won by first checking the number of connected nodes, then
1886 the priority time, then the pnn
1888 struct election_message {
/* Number of nodes this candidate can see as connected (higher wins). */
1889 uint32_t num_connected;
/* Start time of the candidate's recovery daemon (earlier wins). */
1890 struct timeval priority_time;
/* The candidate's node flags (banned/stopped candidates lose). */
1892 uint32_t node_flags;
1896 form this nodes election data
/* Fill *em with this node's election credentials: pnn, daemon start
 * time, own node flags and the count of non-disconnected nodes.
 * A node without CTDB_CAP_RECMASTER deliberately zeroes its connected
 * count and resets priority_time to "now" so it cannot win.
 * NOTE(review): excerpt is elided; `i`/`ret` declarations and the
 * error-path return are not visible here.
 */
1898 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1901 struct ctdb_node_map *nodemap;
1902 struct ctdb_context *ctdb = rec->ctdb;
1906 em->pnn = rec->ctdb->pnn;
1907 em->priority_time = rec->priority_time;
1909 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1911 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
/* Cache our own flags on rec as a side effect for later win checks. */
1915 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1916 em->node_flags = rec->node_flags;
1918 for (i=0;i<nodemap->num;i++) {
1919 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1920 em->num_connected++;
1924 /* we shouldnt try to win this election if we cant be a recmaster */
1925 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1926 em->num_connected = 0;
1927 em->priority_time = timeval_current();
1930 talloc_free(nodemap);
1934 see if the given election data wins
/* Compare our own election data against a rival's (*em) and decide
 * whether WE win.  Order of precedence: capability/banned/stopped
 * disqualifiers first, then most connected nodes, then longest-running
 * daemon (earliest priority_time), then lowest pnn as tie-breaker.
 * NOTE(review): excerpt is elided; the return statements between the
 * comparisons are not visible here.
 */
1936 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1938 struct election_message myem;
1941 ctdb_election_data(rec, &myem);
1943 /* we cant win if we dont have the recmaster capability */
1944 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1948 /* we cant win if we are banned */
1949 if (rec->node_flags & NODE_FLAGS_BANNED) {
1953 /* we cant win if we are stopped */
1954 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1958 /* we will automatically win if the other node is banned */
1959 if (em->node_flags & NODE_FLAGS_BANNED) {
1963 /* we will automatically win if the other node is banned */
1964 if (em->node_flags & NODE_FLAGS_STOPPED) {
1968 /* try to use the most connected node */
1970 cmp = (int)myem.num_connected - (int)em->num_connected;
1973 /* then the longest running node */
1975 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* Final tie-breaker on pnn. */
1979 cmp = (int)myem.pnn - (int)em->pnn;
1986 send out an election request
/* Broadcast this node's election data on CTDB_SRVID_RECOVERY to all
 * nodes.  If update_recmaster is true we optimistically set ourselves
 * as recmaster on the local node before the results are in.
 * NOTE(review): excerpt is elided; `ret`/`srvid` declarations and the
 * return statements are not visible here.
 */
1988 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1991 TDB_DATA election_data;
1992 struct election_message emsg;
1994 struct ctdb_context *ctdb = rec->ctdb;
1996 srvid = CTDB_SRVID_RECOVERY;
1998 ctdb_election_data(rec, &emsg);
2000 election_data.dsize = sizeof(struct election_message);
2001 election_data.dptr = (unsigned char *)&emsg;
2004 /* send an election message to all active nodes */
2005 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2006 ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2009 /* A new node that is already frozen has entered the cluster.
2010 The existing nodes are not frozen and dont need to be frozen
2011 until the election has ended and we start the actual recovery
2013 if (update_recmaster == true) {
2014 /* first we assume we will win the election and set
2015 recoverymaster to be ourself on the current node
2017 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2019 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2029 this function will unban all nodes in the cluster
/* Clear the BANNED flag on every connected-but-banned node, using the
 * current nodemap.  Called e.g. when we lose the recmaster role. */
2031 static void unban_all_nodes(struct ctdb_context *ctdb)
2034 struct ctdb_node_map *nodemap;
2035 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2037 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2039 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2043 for (i=0;i<nodemap->num;i++) {
2044 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2045 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
/* modflags with clear-mask NODE_FLAGS_BANNED removes the ban. */
2046 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
2050 talloc_free(tmp_ctx);
2055 we think we are winning the election - send a broadcast election request
/* Timed-event callback: re-broadcast our election request (without
 * touching the local recmaster setting) and clear the pending timer. */
2057 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2059 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2062 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
2064 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* One-shot timer: free and clear so a new one can be scheduled. */
2067 talloc_free(rec->send_election_te);
2068 rec->send_election_te = NULL;
2072 handler for memory dumps
/* Message handler: produce a talloc memory-usage dump of this daemon
 * and send it back to the requester identified by the rd_memdump_reply
 * (pnn + srvid) carried in the message payload.
 * NOTE(review): excerpt is elided; `ret`/`dump` declarations, NULL
 * checks and returns are not visible here.
 */
2074 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2075 TDB_DATA data, void *private_data)
2077 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2080 struct rd_memdump_reply *rd;
/* Validate payload size before casting to the reply-address struct. */
2082 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2083 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2084 talloc_free(tmp_ctx);
2087 rd = (struct rd_memdump_reply *)data.dptr;
2089 dump = talloc_zero(tmp_ctx, TDB_DATA);
2091 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2092 talloc_free(tmp_ctx);
2095 ret = ctdb_dump_memory(ctdb, dump);
2097 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2098 talloc_free(tmp_ctx);
2102 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2104 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2106 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2107 talloc_free(tmp_ctx);
2111 talloc_free(tmp_ctx);
/* Message handler: fork a child that switches to client mode and sends
 * the in-memory log ringbuffer to the address in the message payload.
 * NOTE(review): excerpt is elided; the parent-branch code and the
 * child's exit path are not visible here.
 */
2117 static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2118 TDB_DATA data, void *private_data)
2120 struct ctdb_get_log_addr *log_addr;
2123 if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
2124 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2127 log_addr = (struct ctdb_get_log_addr *)data.dptr;
/* Fork variant that keeps the ringbuffer intact for the child to read. */
2129 child = ctdb_fork_no_free_ringbuffer(ctdb);
2130 if (child == (pid_t)-1) {
2131 DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
2136 if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
2137 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
2140 ctdb_collect_log(ctdb, log_addr);
2146 handler for clearlog
/* Message handler: wipe the in-memory log ringbuffer. */
2148 static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2149 TDB_DATA data, void *private_data)
2151 ctdb_clear_log(ctdb);
2155 handler for reload_nodes
/* Message handler: re-read the nodes file on request. */
2157 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2158 TDB_DATA data, void *private_data)
2160 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2162 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2164 reload_nodes_file(rec->ctdb);
/* Timed-event callback: the ip-check disable period has expired; free
 * the disable context so public-IP verification resumes. */
2168 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
2169 struct timeval yt, void *p)
2171 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2173 talloc_free(rec->ip_check_disable_ctx);
2174 rec->ip_check_disable_ctx = NULL;
/* Timed-event callback fired after the deferred-rebalance delay: run a
 * takeover run to rebalance IPs; on failure flag a retry via
 * rec->need_takeover_run.  Clears the deferred-rebalance context. */
2178 static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event *te,
2179 struct timeval t, void *p)
2181 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2182 struct ctdb_context *ctdb = rec->ctdb;
2185 DEBUG(DEBUG_NOTICE,("Rebalance all nodes that have had ip assignment changes.\n"));
2187 ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
2189 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
2190 rec->need_takeover_run = true;
2193 talloc_free(rec->deferred_rebalance_ctx);
2194 rec->deferred_rebalance_ctx = NULL;
/* Message handler: a node (pnn in the 4-byte payload) wants IPs
 * rebalanced toward it.  Forces an lcp2 rebalance and (re)arms a
 * deferred timer so the actual takeover run happens after
 * DeferredRebalanceOnNodeAdd seconds. */
2198 static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvid,
2199 TDB_DATA data, void *private_data)
2202 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2204 if (data.dsize != sizeof(uint32_t)) {
2205 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
/* Tunable of 0 disables deferred rebalancing entirely. */
2209 if (ctdb->tunable.deferred_rebalance_on_node_add == 0) {
2213 pnn = *(uint32_t *)&data.dptr[0];
2215 lcp2_forcerebalance(ctdb, pnn);
2216 DEBUG(DEBUG_NOTICE,("Received message to perform node rebalancing for node %d\n", pnn));
/* Restart the timer if a rebalance is already pending. */
2218 if (rec->deferred_rebalance_ctx != NULL) {
2219 talloc_free(rec->deferred_rebalance_ctx);
2221 rec->deferred_rebalance_ctx = talloc_new(rec);
2222 event_add_timed(ctdb->ev, rec->deferred_rebalance_ctx,
2223 timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
2224 ctdb_rebalance_timeout, rec);
/* Message handler: a node reports an IP assignment change.  Only the
 * recmaster records it (in the ip assignment tree); others ignore it. */
2229 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2230 TDB_DATA data, void *private_data)
2232 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2233 struct ctdb_public_ip *ip;
2235 if (rec->recmaster != rec->ctdb->pnn) {
2236 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2240 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2241 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2245 ip = (struct ctdb_public_ip *)data.dptr;
2247 update_ip_assignment_tree(rec->ctdb, ip);
/* Message handler: temporarily disable the public-IP consistency check
 * for `timeout` seconds (payload is a uint32_t).  A timeout of 0
 * re-enables checking immediately.  The disable window is represented
 * by rec->ip_check_disable_ctx, torn down by the reenable_ip_check
 * timer.
 * NOTE(review): excerpt is elided; the `timeout` declaration, the
 * timeout==0 branch structure and returns are not visible here.
 * Only code change in this edit: fixed the log typos "expexting" ->
 * "expecting" and "recaived" -> "received"; all other statements are
 * byte-identical.
 */
2251 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2252 TDB_DATA data, void *private_data)
2254 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* Drop any previous disable window before processing the new request. */
2257 if (rec->ip_check_disable_ctx != NULL) {
2258 talloc_free(rec->ip_check_disable_ctx);
2259 rec->ip_check_disable_ctx = NULL;
2262 if (data.dsize != sizeof(uint32_t)) {
2263 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2264 "expecting %lu\n", (long unsigned)data.dsize,
2265 (long unsigned)sizeof(uint32_t)));
2268 if (data.dptr == NULL) {
2269 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2273 timeout = *((uint32_t *)data.dptr);
2276 DEBUG(DEBUG_NOTICE,("Reenabling ip check\n"));
2280 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
2282 rec->ip_check_disable_ctx = talloc_new(rec);
2283 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
2285 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
2290 handler for reload all ips.
/* Message handler: record a RELOAD_ALL_IPS request (reply address is
 * the payload) in the file-global reload_all_ips_request, to be acted
 * on later from the monitor loop.  The payload is talloc_steal'd onto
 * rec so it outlives this handler. */
2292 static void ip_reloadall_handler(struct ctdb_context *ctdb, uint64_t srvid,
2293 TDB_DATA data, void *private_data)
2295 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2297 if (data.dsize != sizeof(struct reloadips_all_reply)) {
2298 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2302 reload_all_ips_request = (struct reloadips_all_reply *)talloc_steal(rec, data.dptr);
2304 DEBUG(DEBUG_NOTICE,("RELOAD_ALL_IPS message received from node:%d srvid:%d\n", reload_all_ips_request->pnn, (int)reload_all_ips_request->srvid));
/* Async-control fail callback for RELOAD_PUBLIC_IPS: log the failing
 * node.  callback_data is a uint32_t status accumulator (the update to
 * it is on an elided line in this excerpt). */
2308 static void async_reloadips_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2310 uint32_t *status = callback_data;
2313 DEBUG(DEBUG_ERR,("Reload ips all failed on node %d\n", node_pnn));
/* Execute a pending RELOAD_ALL_IPS request: require every node to be
 * up and healthy (flags == 0), broadcast CTDB_CONTROL_RELOAD_PUBLIC_IPS
 * to all connected nodes, then notify the original requester (rips)
 * with an empty message.
 * NOTE(review): excerpt is elided; the return type/statements, the
 * `nodes` declaration, and the status check between L1069 and L1070
 * are not visible here.
 */
2319 reload_all_ips(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, struct reloadips_all_reply *rips)
2321 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2326 DEBUG(DEBUG_ERR,("RELOAD ALL IPS on all active nodes\n"));
2327 for (i = 0; i< nodemap->num; i++) {
2328 if (nodemap->nodes[i].flags != 0) {
2329 DEBUG(DEBUG_ERR, ("Can not reload ips on all nodes. Node %d is not up and healthy\n", i));
2330 talloc_free(tmp_ctx);
2335 /* send the flags update to all connected nodes */
2336 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2338 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELOAD_PUBLIC_IPS,
2342 async_reloadips_callback, NULL,
2344 DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2345 talloc_free(tmp_ctx);
2350 DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2351 talloc_free(tmp_ctx);
/* Tell the requester we are done (empty payload = ack). */
2355 ctdb_client_send_message(ctdb, rips->pnn, rips->srvid, tdb_null);
2357 talloc_free(tmp_ctx);
2363 handler for ip reallocate, just add it to the list of callers and
2364 handle this later in the monitor_cluster loop so we do not recurse
2365 with other callers to takeover_run()
2367 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2368 TDB_DATA data, void *private_data)
2370 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2371 struct ip_reallocate_list *caller;
/* The payload is the caller's reply address (pnn + srvid). */
2373 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2374 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
/* Lazily create the context that owns all queued callers; it is freed
 * wholesale once the requests are processed. */
2378 if (rec->ip_reallocate_ctx == NULL) {
2379 rec->ip_reallocate_ctx = talloc_new(rec);
2380 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2383 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2384 CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* Steal the reply address and push the caller onto the singly-linked
 * list consumed by process_ipreallocate_requests(). */
2386 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2387 caller->next = rec->reallocate_callers;
2388 rec->reallocate_callers = caller;
/* Drain the queue built by ip_reallocate_handler(): refresh the remote
 * public-IP lists, run one takeover run, then send the (int32) result
 * to every queued caller that asked for a reply (srvid != 0).
 * Finally tear down the queue context.
 * NOTE(review): excerpt is elided; `ret`/`culprit`/`result`
 * declarations and some braces are not visible here.
 */
2393 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2395 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2398 struct ip_reallocate_list *callers;
2401 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2403 /* update the list of public ips that a node can handle for
2406 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2408 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2410 rec->need_takeover_run = true;
2413 ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
2415 DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
2416 rec->need_takeover_run = true;
/* All callers receive the same takeover-run result code. */
2420 result.dsize = sizeof(int32_t);
2421 result.dptr = (uint8_t *)&ret;
2423 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2425 /* Someone that sent srvid==0 does not want a reply */
2426 if (callers->rd->srvid == 0) {
2429 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2430 "%u:%llu\n", (unsigned)callers->rd->pnn,
2431 (unsigned long long)callers->rd->srvid));
2432 ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2434 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2435 "message to %u:%llu\n",
2436 (unsigned)callers->rd->pnn,
2437 (unsigned long long)callers->rd->srvid));
/* Freeing ip_reallocate_ctx releases every queued caller at once. */
2441 talloc_free(tmp_ctx);
2442 talloc_free(rec->ip_reallocate_ctx);
2443 rec->ip_reallocate_ctx = NULL;
2444 rec->reallocate_callers = NULL;
2450 handler for recovery master elections
/* Message handler for incoming election packets.  Resets the election
 * timeout, then either (a) we would win: schedule a (re)broadcast of
 * our own election request, or (b) we concede: drop any pending
 * broadcast, release the reclock fd if the winner is another node, and
 * set the sender as recmaster locally.
 * NOTE(review): excerpt is elided; `ret` declaration, the election
 * timeout condition at L1128, and returns are not visible here.
 */
2452 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2453 TDB_DATA data, void *private_data)
2455 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2457 struct election_message *em = (struct election_message *)data.dptr;
2458 TALLOC_CTX *mem_ctx;
2460 /* we got an election packet - update the timeout for the election */
2461 talloc_free(rec->election_timeout);
2462 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2464 timeval_current_ofs(0, 500000) :
2465 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2466 ctdb_election_timeout, rec);
2468 mem_ctx = talloc_new(ctdb);
2470 /* someone called an election. check their election data
2471 and if we disagree and we would rather be the elected node,
2472 send a new election message to all other nodes
2474 if (ctdb_election_win(rec, em)) {
2475 if (!rec->send_election_te) {
2476 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2477 timeval_current_ofs(0, 500000),
2478 election_send_request, rec);
2480 talloc_free(mem_ctx);
2481 /*unban_all_nodes(ctdb);*/
/* We lost: cancel any pending re-broadcast of our candidacy. */
2486 talloc_free(rec->send_election_te);
2487 rec->send_election_te = NULL;
2489 if (ctdb->tunable.verify_recovery_lock != 0) {
2490 /* release the recmaster lock */
2491 if (em->pnn != ctdb->pnn &&
2492 ctdb->recovery_lock_fd != -1) {
2493 close(ctdb->recovery_lock_fd);
2494 ctdb->recovery_lock_fd = -1;
2495 unban_all_nodes(ctdb);
2499 /* ok, let that guy become recmaster then */
2500 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2502 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2503 talloc_free(mem_ctx);
2507 talloc_free(mem_ctx);
2513 force the start of the election process
/* Start a new recmaster election: freeze cluster traffic by setting
 * recovery mode active everywhere, arm the election timeout,
 * broadcast our election request and block until responses are
 * collected in ctdb_wait_election(). */
2515 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2516 struct ctdb_node_map *nodemap)
2519 struct ctdb_context *ctdb = rec->ctdb;
2521 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2523 /* set all nodes to recovery mode to stop all internode traffic */
2524 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2526 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (Re)start the election timeout before sending the request; the
 * 500ms vs tunable choice is selected on an elided line. */
2530 talloc_free(rec->election_timeout);
2531 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2533 timeval_current_ofs(0, 500000) :
2534 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2535 ctdb_election_timeout, rec);
2537 ret = send_election_request(rec, pnn, true);
2539 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2543 /* wait for a few seconds to collect all responses */
2544 ctdb_wait_election(rec);
2550 handler for when a node changes its flags
/* Message handler invoked when any node's flags change.  Validates
 * the payload, refreshes the local nodemap, and — when we are the
 * recmaster and the cluster is in normal mode — schedules a takeover
 * run if the DISABLED/UNHEALTHY state changed. */
2552 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2553 TDB_DATA data, void *private_data)
2556 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2557 struct ctdb_node_map *nodemap=NULL;
2558 TALLOC_CTX *tmp_ctx;
2560 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2561 int disabled_flag_changed;
/* Reject malformed messages before touching the payload. */
2563 if (data.dsize != sizeof(*c)) {
2564 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2568 tmp_ctx = talloc_new(ctdb);
2569 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2571 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2573 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2574 talloc_free(tmp_ctx);
/* Locate the entry for the node whose flags changed. */
2579 for (i=0;i<nodemap->num;i++) {
2580 if (nodemap->nodes[i].pnn == c->pnn) break;
2583 if (i == nodemap->num) {
2584 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2585 talloc_free(tmp_ctx);
2589 if (nodemap->nodes[i].flags != c->new_flags) {
2590 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, nodemap->nodes[i].flags));
/* XOR old vs new flags: non-zero iff the DISABLED bit flipped. */
2593 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2595 nodemap->nodes[i].flags = c->new_flags;
2597 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2598 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2601 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2602 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
/* Only the recmaster reacts, and only outside of recovery. */
2606 ctdb->recovery_master == ctdb->pnn &&
2607 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2608 /* Only do the takeover run if the perm disabled or unhealthy
2609 flags changed since these will cause an ip failover but not
2611 If the node became disconnected or banned this will also
2612 lead to an ip address failover but that is handled
2615 if (disabled_flag_changed) {
2616 rec->need_takeover_run = true;
2620 talloc_free(tmp_ctx);
2624 handler for when we need to push out flag changes ot all other nodes
/* Message handler: re-read authoritative node flags from the current
 * recmaster and push them to every connected node via the
 * MODIFY_FLAGS control. */
2626 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2627 TDB_DATA data, void *private_data)
2630 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2631 struct ctdb_node_map *nodemap=NULL;
2632 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2636 /* find the recovery master */
2637 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2639 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2640 talloc_free(tmp_ctx);
2644 /* read the node flags from the recmaster */
2645 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
/* NOTE(review): this error reports c->pnn but the failed call
 * targeted `recmaster` — looks like the wrong pnn is logged;
 * confirm against upstream before changing. */
2647 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2648 talloc_free(tmp_ctx);
/* The sender's pnn must exist in the recmaster's nodemap. */
2651 if (c->pnn >= nodemap->num) {
2652 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2653 talloc_free(tmp_ctx);
2657 /* send the flags update to all connected nodes */
2658 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2660 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2661 nodes, 0, CONTROL_TIMEOUT(),
2665 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2667 talloc_free(tmp_ctx);
2671 talloc_free(tmp_ctx);
/* State shared by verify_recmode() and its async callback: an
 * outstanding-reply counter (declared on an elided line) plus the
 * aggregated monitoring result. */
2675 struct verify_recmode_normal_data {
2677 enum monitor_result status;
/* Async completion callback for one getrecmode control sent by
 * verify_recmode().  Downgrades rmdata->status on failure, and marks
 * MONITOR_RECOVERY_NEEDED if any node reports it is in recovery. */
2680 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2682 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2685 /* one more node has responded with recmode data*/
2688 /* if we failed to get the recmode, then return an error and let
2689 the main loop try again.
/* Only overwrite status when it is still OK, so a more specific
 * result from another node is not clobbered. */
2691 if (state->state != CTDB_CONTROL_DONE) {
2692 if (rmdata->status == MONITOR_OK) {
2693 rmdata->status = MONITOR_FAILED;
2698 /* if we got a response, then the recmode will be stored in the
/* state->status holds the remote node's recovery mode. */
2701 if (state->status != CTDB_RECOVERY_NORMAL) {
2702 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2703 rmdata->status = MONITOR_RECOVERY_NEEDED;
2710 /* verify that all nodes are in normal recovery mode */
/* Fan out an async getrecmode control to every active node, then pump
 * the event loop until all replies arrive.  Returns the aggregated
 * monitor_result (OK / FAILED / RECOVERY_NEEDED). */
2711 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2713 struct verify_recmode_normal_data *rmdata;
2714 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2715 struct ctdb_client_control_state *state;
2716 enum monitor_result status;
2719 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2720 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2722 rmdata->status = MONITOR_OK;
2724 /* loop over all active nodes and send an async getrecmode call to
/* Skip banned/stopped/disconnected nodes. */
2726 for (j=0; j<nodemap->num; j++) {
2727 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2730 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2732 nodemap->nodes[j].pnn);
2733 if (state == NULL) {
2734 /* we failed to send the control, treat this as
2735 an error and try again next iteration
2737 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2738 talloc_free(mem_ctx);
2739 return MONITOR_FAILED;
2742 /* set up the callback functions */
2743 state->async.fn = verify_recmode_normal_callback;
2744 state->async.private_data = rmdata;
2746 /* one more control to wait for to complete */
2751 /* now wait for up to the maximum number of seconds allowed
2752 or until all nodes we expect a response from has replied
/* rmdata->count is decremented by the callback as replies land. */
2754 while (rmdata->count > 0) {
2755 event_loop_once(ctdb->ev);
/* Copy status out before freeing rmdata's parent context. */
2758 status = rmdata->status;
2759 talloc_free(mem_ctx);
/* State shared by verify_recmaster() and its async callback; the
 * reply counter and expected pnn fields are declared on elided
 * lines.  rec is used to record culprit nodes. */
2764 struct verify_recmaster_data {
2765 struct ctdb_recoverd *rec;
2768 enum monitor_result status;
/* Async completion callback for one getrecmaster control sent by
 * verify_recmaster().  Flags MONITOR_ELECTION_NEEDED (and records the
 * disagreeing node as culprit) if a node does not consider us
 * recmaster. */
2771 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2773 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2776 /* one more node has responded with recmaster data*/
2779 /* if we failed to get the recmaster, then return an error and let
2780 the main loop try again.
/* Only downgrade from OK so a stronger result is preserved. */
2782 if (state->state != CTDB_CONTROL_DONE) {
2783 if (rmdata->status == MONITOR_OK) {
2784 rmdata->status = MONITOR_FAILED;
2789 /* if we got a response, then the recmaster will be stored in the
/* state->status holds the pnn the remote node believes is recmaster;
 * compare with our own pnn (rmdata->pnn). */
2792 if (state->status != rmdata->pnn) {
2793 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2794 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2795 rmdata->status = MONITOR_ELECTION_NEEDED;
2802 /* verify that all nodes agree that we are the recmaster */
/* Fan out an async getrecmaster control to every active node and pump
 * the event loop until all replies arrive.  Returns OK, FAILED, or
 * ELECTION_NEEDED if any node disagrees that `pnn` is recmaster. */
2803 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2805 struct ctdb_context *ctdb = rec->ctdb;
2806 struct verify_recmaster_data *rmdata;
2807 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2808 struct ctdb_client_control_state *state;
2809 enum monitor_result status;
2812 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2813 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2817 rmdata->status = MONITOR_OK;
2819 /* loop over all active nodes and send an async getrecmaster call to
/* Skip banned/stopped/disconnected nodes. */
2821 for (j=0; j<nodemap->num; j++) {
2822 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2825 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2827 nodemap->nodes[j].pnn);
2828 if (state == NULL) {
2829 /* we failed to send the control, treat this as
2830 an error and try again next iteration
2832 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2833 talloc_free(mem_ctx);
2834 return MONITOR_FAILED;
2837 /* set up the callback functions */
2838 state->async.fn = verify_recmaster_callback;
2839 state->async.private_data = rmdata;
2841 /* one more control to wait for to complete */
2846 /* now wait for up to the maximum number of seconds allowed
2847 or until all nodes we expect a response from has replied
/* rmdata->count is decremented by the callback as replies land. */
2849 while (rmdata->count > 0) {
2850 event_loop_once(ctdb->ev);
2853 status = rmdata->status;
2854 talloc_free(mem_ctx);
/* Compare the locally reported interface list against the snapshot
 * cached in rec->ifaces.  Returns true on any difference (count,
 * name, or link state), errs on the side of "changed" when the query
 * fails, and always refreshes the cached snapshot before returning. */
2858 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2859 struct ctdb_recoverd *rec)
2861 struct ctdb_control_get_ifaces *ifaces = NULL;
2862 TALLOC_CTX *mem_ctx;
2865 mem_ctx = talloc_new(NULL);
2867 /* Read the interfaces from the local node */
2868 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2869 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2870 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2871 /* We could return an error. However, this will be
2872 * rare so we'll decide that the interfaces have
2873 * actually changed, just in case.
2875 talloc_free(mem_ctx);
2880 /* We haven't been here before so things have changed */
2882 } else if (rec->ifaces->num != ifaces->num) {
2883 /* Number of interfaces has changed */
2886 /* See if interface names or link states have changed */
/* Positional comparison: assumes the daemon reports interfaces in a
 * stable order between calls. */
2888 for (i = 0; i < rec->ifaces->num; i++) {
2889 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
2890 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0 ||
2891 iface->link_state != ifaces->ifaces[i].link_state) {
/* Replace the cached snapshot with the fresh one (reparented to rec
 * so it survives mem_ctx's destruction). */
2898 talloc_free(rec->ifaces);
2899 rec->ifaces = talloc_steal(rec, ifaces);
2901 talloc_free(mem_ctx);
2905 /* called to check that the local allocation of public ip addresses is ok.
/* Verify that this node holds exactly the public IPs it should:
 * - skip the check if a recovery started/finished while sampling
 *   (compared via two uptime snapshots taken around the check);
 * - flag a takeover run for unassigned-but-servable IPs, for IPs
 *   assigned to us but missing from an interface, and release IPs we
 *   serve but should not;
 * - if anything is wrong, message the recmaster to trigger a
 *   takeover run. */
2907 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
2909 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2910 struct ctdb_uptime *uptime1 = NULL;
2911 struct ctdb_uptime *uptime2 = NULL;
2913 bool need_takeover_run = false;
/* First uptime snapshot — taken before the interface check. */
2915 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2916 CTDB_CURRENT_NODE, &uptime1);
2918 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2919 talloc_free(mem_ctx);
2923 if (interfaces_have_changed(ctdb, rec)) {
2924 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2925 "local node %u - force takeover run\n",
2927 need_takeover_run = true;
/* Second uptime snapshot — used to detect a concurrent recovery. */
2930 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2931 CTDB_CURRENT_NODE, &uptime2);
2933 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2934 talloc_free(mem_ctx);
2938 /* skip the check if the startrecovery time has changed */
2939 if (timeval_compare(&uptime1->last_recovery_started,
2940 &uptime2->last_recovery_started) != 0) {
2941 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2942 talloc_free(mem_ctx);
2946 /* skip the check if the endrecovery time has changed */
2947 if (timeval_compare(&uptime1->last_recovery_finished,
2948 &uptime2->last_recovery_finished) != 0) {
2949 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2950 talloc_free(mem_ctx);
2954 /* skip the check if we have started but not finished recovery */
2955 if (timeval_compare(&uptime1->last_recovery_finished,
2956 &uptime1->last_recovery_started) != 1) {
2957 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2958 talloc_free(mem_ctx);
2963 /* verify that we have the ip addresses we should have
2964 and we dont have ones we shouldnt have.
2965 if we find an inconsistency we set recmode to
2966 active on the local node and wait for the recmaster
2967 to do a full blown recovery.
2968 also if the pnn is -1 and we are healthy and can host the ip
2969 we also request a ip reallocation.
2971 if (ctdb->tunable.disable_ip_failover == 0) {
2972 struct ctdb_all_public_ips *ips = NULL;
2974 /* read the *available* IPs from the local node */
2975 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2977 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
2978 talloc_free(mem_ctx);
/* pnn == -1 means unassigned; flags == 0 means this node is fully
 * healthy, so it could serve the IP. */
2982 for (j=0; j<ips->num; j++) {
2983 if (ips->ips[j].pnn == -1 &&
2984 nodemap->nodes[pnn].flags == 0) {
2985 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
2986 ctdb_addr_to_str(&ips->ips[j].addr)));
2987 need_takeover_run = true;
2993 /* read the *known* IPs from the local node */
2994 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2996 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
2997 talloc_free(mem_ctx);
3001 for (j=0; j<ips->num; j++) {
3002 if (ips->ips[j].pnn == pnn) {
/* IP assigned to us: make sure it is actually configured on a
 * local interface (only when do_checkpublicip is enabled). */
3003 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3004 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3005 ctdb_addr_to_str(&ips->ips[j].addr)));
3006 need_takeover_run = true;
/* IP assigned elsewhere but still present locally: release it. */
3009 if (ctdb->do_checkpublicip &&
3010 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3012 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3013 ctdb_addr_to_str(&ips->ips[j].addr)));
3015 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3016 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
/* Ask the recmaster to perform a takeover run on our behalf. */
3023 if (need_takeover_run) {
3024 struct takeover_run_reply rd;
3027 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3031 data.dptr = (uint8_t *)&rd;
3032 data.dsize = sizeof(rd);
3034 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3036 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3039 talloc_free(mem_ctx);
/* Async callback for GET_NODEMAP: stash the reply into the
 * remote_nodemaps array slot for the responding node, after bounds
 * checking the pnn.  The reply buffer is reparented onto the array
 * so it outlives the control state. */
3044 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3046 struct ctdb_node_map **remote_nodemaps = callback_data;
3048 if (node_pnn >= ctdb->num_nodes) {
3049 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3053 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Fetch the nodemap from every active node in parallel; results are
 * written into remote_nodemaps[] (indexed by pnn) by
 * async_getnodemap_callback.  Returns non-zero if any node failed to
 * reply (return lines are elided here). */
3057 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3058 struct ctdb_node_map *nodemap,
3059 struct ctdb_node_map **remote_nodemaps)
3063 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3064 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3066 CONTROL_TIMEOUT(), false, tdb_null,
3067 async_getnodemap_callback,
3069 remote_nodemaps) != 0) {
3070 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Result codes for the reclock-checking child process; RECLOCK_OK is
 * also the byte value written back over the pipe. */
3078 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/* State for one recovery-lock check: the forked child, the pipe used
 * to report back (fd pair declared on an elided line), plus the
 * timeout and fd events driving the wait loop. */
3079 struct ctdb_check_reclock_state {
3080 struct ctdb_context *ctdb;
3081 struct timeval start_time;
3084 struct timed_event *te;
3085 struct fd_event *fde;
3086 enum reclock_child_status status;
3089 /* when we free the reclock state we must kill any child process.
/* talloc destructor: report lock-check latency, close both pipe
 * ends, and SIGKILL the child so it never outlives the state. */
3091 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
3093 struct ctdb_context *ctdb = state->ctdb;
3095 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
3097 if (state->fd[0] != -1) {
3098 close(state->fd[0]);
3101 if (state->fd[1] != -1) {
3102 close(state->fd[1]);
3105 ctdb_kill(ctdb, state->child, SIGKILL);
3110 called if our check_reclock child times out. this would happen if
3111 i/o to the reclock file blocks.
/* Timed-event handler: mark the check as timed out so the wait loop
 * in check_recovery_lock() terminates. */
3113 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3114 struct timeval t, void *private_data)
3116 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3117 struct ctdb_check_reclock_state);
3119 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3120 state->status = RECLOCK_TIMEOUT;
3123 /* this is called when the child process has completed checking the reclock
3124 file and has written data back to us through the pipe.
/* fd-event handler on the pipe's read end: cancel the timeout, read
 * the one-byte status from the child and record RECLOCK_OK or
 * RECLOCK_FAILED accordingly. */
3126 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
3127 uint16_t flags, void *private_data)
3129 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
3130 struct ctdb_check_reclock_state);
3134 /* we got a response from our child process so we can abort the
/* Freeing the timed event cancels the pending timeout. */
3137 talloc_free(state->te);
3140 ret = read(state->fd[0], &c, 1);
3141 if (ret != 1 || c != RECLOCK_OK) {
3142 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3143 state->status = RECLOCK_FAILED;
3148 state->status = RECLOCK_OK;
/* Verify that the recovery lock file we hold is still valid.  Forks a
 * child that pread()s the locked fd (which would block if the cluster
 * filesystem is unhealthy) and reports a status byte back over a
 * pipe; the parent waits on the pipe with a 15s timeout.  On failure
 * the stale lock fd is closed. */
3152 static int check_recovery_lock(struct ctdb_context *ctdb)
3155 struct ctdb_check_reclock_state *state;
3156 pid_t parent = getpid();
3158 if (ctdb->recovery_lock_fd == -1) {
3159 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
3163 state = talloc(ctdb, struct ctdb_check_reclock_state);
3164 CTDB_NO_MEMORY(ctdb, state);
3167 state->start_time = timeval_current();
3168 state->status = RECLOCK_CHECKING;
3172 ret = pipe(state->fd);
3175 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
3179 state->child = ctdb_fork(ctdb);
3180 if (state->child == (pid_t)-1) {
3181 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
3182 close(state->fd[0]);
3184 close(state->fd[1]);
/* ---- child process ---- */
3190 if (state->child == 0) {
3191 char cc = RECLOCK_OK;
3192 close(state->fd[0]);
3195 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
/* Reading through the locked fd probes the cluster filesystem;
 * a hang here is what the parent's timeout catches. */
3196 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
3197 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
3198 cc = RECLOCK_FAILED;
/* NOTE(review): write() return values are ignored here; a failed
 * pipe write would be indistinguishable from a hang to the parent. */
3201 write(state->fd[1], &cc, 1);
3202 /* make sure we die when our parent dies */
/* Poll the parent with signal 0; exit once it is gone (ESRCH). */
3203 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
3205 write(state->fd[1], &cc, 1);
/* ---- parent process ---- */
3209 close(state->fd[1]);
3211 set_close_on_exec(state->fd[0]);
3213 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
3215 talloc_set_destructor(state, check_reclock_destructor);
/* 15-second hard limit for the child to respond. */
3217 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
3218 ctdb_check_reclock_timeout, state);
3219 if (state->te == NULL) {
3220 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
3225 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
3227 reclock_child_handler,
3230 if (state->fde == NULL) {
3231 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3235 tevent_fd_set_auto_close(state->fde);
/* Pump the event loop until the fd handler or timeout fires. */
3237 while (state->status == RECLOCK_CHECKING) {
3238 event_loop_once(ctdb->ev);
3241 if (state->status == RECLOCK_FAILED) {
3242 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
3243 close(ctdb->recovery_lock_fd);
3244 ctdb->recovery_lock_fd = -1;
/* Re-read the configured reclock path from the daemon and reconcile
 * the cached copy.  Handles three cases: the reclock was disabled,
 * it was enabled for the first time, or the path changed — in each
 * case any held lock fd is closed so it can be re-acquired against
 * the right file. */
3253 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3255 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3256 const char *reclockfile;
3258 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3259 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3260 talloc_free(tmp_ctx);
/* Case 1: daemon reports no reclock file — disable verification. */
3264 if (reclockfile == NULL) {
3265 if (ctdb->recovery_lock_file != NULL) {
3266 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3267 talloc_free(ctdb->recovery_lock_file);
3268 ctdb->recovery_lock_file = NULL;
3269 if (ctdb->recovery_lock_fd != -1) {
3270 close(ctdb->recovery_lock_fd);
3271 ctdb->recovery_lock_fd = -1;
3274 ctdb->tunable.verify_recovery_lock = 0;
3275 talloc_free(tmp_ctx);
/* Case 2: we had no cached path yet — adopt the reported one. */
3279 if (ctdb->recovery_lock_file == NULL) {
3280 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3281 if (ctdb->recovery_lock_fd != -1) {
3282 close(ctdb->recovery_lock_fd);
3283 ctdb->recovery_lock_fd = -1;
3285 talloc_free(tmp_ctx);
/* Unchanged path: nothing to do. */
3290 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3291 talloc_free(tmp_ctx);
/* Case 3: path changed — swap the cached string and drop the lock.
 * NOTE(review): verify_recovery_lock is forced to 0 here; presumably
 * it is re-read with the tunables on the next iteration — confirm. */
3295 talloc_free(ctdb->recovery_lock_file);
3296 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3297 ctdb->tunable.verify_recovery_lock = 0;
3298 if (ctdb->recovery_lock_fd != -1) {
3299 close(ctdb->recovery_lock_fd);
3300 ctdb->recovery_lock_fd = -1;
3303 talloc_free(tmp_ctx);
3307 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3308 TALLOC_CTX *mem_ctx)
3311 struct ctdb_node_map *nodemap=NULL;
3312 struct ctdb_node_map *recmaster_nodemap=NULL;
3313 struct ctdb_node_map **remote_nodemaps=NULL;
3314 struct ctdb_vnn_map *vnnmap=NULL;
3315 struct ctdb_vnn_map *remote_vnnmap=NULL;
3316 int32_t debug_level;
3321 /* verify that the main daemon is still running */
3322 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3323 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3327 /* ping the local daemon to tell it we are alive */
3328 ctdb_ctrl_recd_ping(ctdb);
3330 if (rec->election_timeout) {
3331 /* an election is in progress */
3335 /* read the debug level from the parent and update locally */
3336 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3338 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3341 LogLevel = debug_level;
3344 /* We must check if we need to ban a node here but we want to do this
3345 as early as possible so we dont wait until we have pulled the node
3346 map from the local node. thats why we have the hardcoded value 20
3348 for (i=0; i<ctdb->num_nodes; i++) {
3349 struct ctdb_banning_state *ban_state;
3351 if (ctdb->nodes[i]->ban_state == NULL) {
3354 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
3355 if (ban_state->count < 20) {
3358 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
3359 ctdb->nodes[i]->pnn, ban_state->count,
3360 ctdb->tunable.recovery_ban_period));
3361 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
3362 ban_state->count = 0;
3365 /* get relevant tunables */
3366 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3368 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3372 /* get the current recovery lock file from the server */
3373 if (update_recovery_lock_file(ctdb) != 0) {
3374 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3378 /* Make sure that if recovery lock verification becomes disabled when
3381 if (ctdb->tunable.verify_recovery_lock == 0) {
3382 if (ctdb->recovery_lock_fd != -1) {
3383 close(ctdb->recovery_lock_fd);
3384 ctdb->recovery_lock_fd = -1;
3388 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3389 if (pnn == (uint32_t)-1) {
3390 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
3394 /* get the vnnmap */
3395 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3397 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3402 /* get number of nodes */
3404 talloc_free(rec->nodemap);
3405 rec->nodemap = NULL;
3408 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3410 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3413 nodemap = rec->nodemap;
3415 /* update the capabilities for all nodes */
3416 ret = update_capabilities(ctdb, nodemap);
3418 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3422 /* check which node is the recovery master */
3423 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3425 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3429 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
3430 if (rec->recmaster != pnn) {
3431 if (rec->ip_reallocate_ctx != NULL) {
3432 talloc_free(rec->ip_reallocate_ctx);
3433 rec->ip_reallocate_ctx = NULL;
3434 rec->reallocate_callers = NULL;
3438 if (rec->recmaster == (uint32_t)-1) {
3439 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3440 force_election(rec, pnn, nodemap);
3444 /* if the local daemon is STOPPED, we verify that the databases are
3445 also frozen and thet the recmode is set to active
3447 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
3448 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3450 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3452 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3453 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
3455 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3457 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED state\n"));
3460 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3462 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED state\n"));
3469 /* If the local node is stopped, verify we are not the recmaster
3470 and yield this role if so
3472 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) && (rec->recmaster == pnn)) {
3473 DEBUG(DEBUG_ERR,("Local node is INACTIVE. Yielding recmaster role\n"));
3474 force_election(rec, pnn, nodemap);
3479 * if the current recmaster do not have CTDB_CAP_RECMASTER,
3480 * but we have force an election and try to become the new
3483 if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3484 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3485 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3486 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3487 " but we (node %u) have - force an election\n",
3488 rec->recmaster, pnn));
3489 force_election(rec, pnn, nodemap);
3493 /* check that we (recovery daemon) and the local ctdb daemon
3494 agrees on whether we are banned or not
3497 /* remember our own node flags */
3498 rec->node_flags = nodemap->nodes[pnn].flags;
3500 /* count how many active nodes there are */
3501 rec->num_active = 0;
3502 rec->num_connected = 0;
3503 for (i=0; i<nodemap->num; i++) {
3504 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3507 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3508 rec->num_connected++;
3513 /* verify that the recmaster node is still active */
3514 for (j=0; j<nodemap->num; j++) {
3515 if (nodemap->nodes[j].pnn==rec->recmaster) {
3520 if (j == nodemap->num) {
3521 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3522 force_election(rec, pnn, nodemap);
3526 /* if recovery master is disconnected we must elect a new recmaster */
3527 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3528 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3529 force_election(rec, pnn, nodemap);
3533 /* get nodemap from the recovery master to check if it is inactive */
3534 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3535 mem_ctx, &recmaster_nodemap);
3537 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3538 nodemap->nodes[j].pnn));
3543 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3544 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3545 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3546 force_election(rec, pnn, nodemap);
3550 /* If this node is stopped then it is not the recovery master
3551 * so the only remaining action is to potentially to verify
3552 * the local IP allocation below. This won't accomplish
3553 * anything useful so skip it.
3555 if (rec->node_flags & NODE_FLAGS_STOPPED) {
3559 /* verify that we have all ip addresses we should have and we dont
3560 * have addresses we shouldnt have.
3562 if (ctdb->tunable.disable_ip_failover == 0) {
3563 if (rec->ip_check_disable_ctx == NULL) {
3564 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3565 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3571 /* if we are not the recmaster then we do not need to check
3572 if recovery is needed
3574 if (pnn != rec->recmaster) {
3579 /* ensure our local copies of flags are right */
3580 ret = update_local_flags(rec, nodemap);
3581 if (ret == MONITOR_ELECTION_NEEDED) {
3582 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3583 force_election(rec, pnn, nodemap);
3586 if (ret != MONITOR_OK) {
3587 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3591 if (ctdb->num_nodes != nodemap->num) {
3592 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3593 reload_nodes_file(ctdb);
3597 /* verify that all active nodes agree that we are the recmaster */
3598 switch (verify_recmaster(rec, nodemap, pnn)) {
3599 case MONITOR_RECOVERY_NEEDED:
3600 /* can not happen */
3602 case MONITOR_ELECTION_NEEDED:
3603 force_election(rec, pnn, nodemap);
3607 case MONITOR_FAILED:
3612 if (rec->need_recovery) {
3613 /* a previous recovery didn't finish */
3614 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3618 /* verify that all active nodes are in normal mode
3619 and not in recovery mode
3621 switch (verify_recmode(ctdb, nodemap)) {
3622 case MONITOR_RECOVERY_NEEDED:
3623 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3625 case MONITOR_FAILED:
3627 case MONITOR_ELECTION_NEEDED:
3628 /* can not happen */
3634 if (ctdb->tunable.verify_recovery_lock != 0) {
3635 /* we should have the reclock - check its not stale */
3636 ret = check_recovery_lock(ctdb);
3638 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3639 ctdb_set_culprit(rec, ctdb->pnn);
3640 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3646 /* is there a pending reload all ips ? */
3647 if (reload_all_ips_request != NULL) {
3648 reload_all_ips(ctdb, rec, nodemap, reload_all_ips_request);
3649 talloc_free(reload_all_ips_request);
3650 reload_all_ips_request = NULL;
3653 /* if there are takeovers requested, perform it and notify the waiters */
3654 if (rec->reallocate_callers) {
3655 process_ipreallocate_requests(ctdb, rec);
3658 /* get the nodemap for all active remote nodes
3660 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3661 if (remote_nodemaps == NULL) {
3662 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3665 for(i=0; i<nodemap->num; i++) {
3666 remote_nodemaps[i] = NULL;
3668 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3669 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3673 /* verify that all other nodes have the same nodemap as we have
3675 for (j=0; j<nodemap->num; j++) {
3676 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3680 if (remote_nodemaps[j] == NULL) {
3681 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3682 ctdb_set_culprit(rec, j);
3687 /* if the nodes disagree on how many nodes there are
3688 then this is a good reason to try recovery
3690 if (remote_nodemaps[j]->num != nodemap->num) {
3691 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3692 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3693 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3694 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3698 /* if the nodes disagree on which nodes exist and are
3699 active, then that is also a good reason to do recovery
3701 for (i=0;i<nodemap->num;i++) {
3702 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3703 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3704 nodemap->nodes[j].pnn, i,
3705 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3706 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3707 do_recovery(rec, mem_ctx, pnn, nodemap,
3713 /* verify the flags are consistent
3715 for (i=0; i<nodemap->num; i++) {
3716 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3720 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3721 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3722 nodemap->nodes[j].pnn,
3723 nodemap->nodes[i].pnn,
3724 remote_nodemaps[j]->nodes[i].flags,
3725 nodemap->nodes[i].flags));
3727 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3728 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3729 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3730 do_recovery(rec, mem_ctx, pnn, nodemap,
3734 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3735 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3736 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3737 do_recovery(rec, mem_ctx, pnn, nodemap,
3746 /* there better be the same number of lmasters in the vnn map
3747 as there are active nodes or we will have to do a recovery
3749 if (vnnmap->size != rec->num_active) {
3750 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3751 vnnmap->size, rec->num_active));
3752 ctdb_set_culprit(rec, ctdb->pnn);
3753 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3757 /* verify that all active nodes in the nodemap also exist in
3760 for (j=0; j<nodemap->num; j++) {
3761 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3764 if (nodemap->nodes[j].pnn == pnn) {
3768 for (i=0; i<vnnmap->size; i++) {
3769 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3773 if (i == vnnmap->size) {
3774 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3775 nodemap->nodes[j].pnn));
3776 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3777 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3783 /* verify that all other nodes have the same vnnmap
3784 and are from the same generation
3786 for (j=0; j<nodemap->num; j++) {
3787 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3790 if (nodemap->nodes[j].pnn == pnn) {
3794 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3795 mem_ctx, &remote_vnnmap);
3797 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3798 nodemap->nodes[j].pnn));
3802 /* verify the vnnmap generation is the same */
3803 if (vnnmap->generation != remote_vnnmap->generation) {
3804 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3805 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3806 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3807 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3811 /* verify the vnnmap size is the same */
3812 if (vnnmap->size != remote_vnnmap->size) {
3813 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3814 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3815 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3816 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3820 /* verify the vnnmap is the same */
3821 for (i=0;i<vnnmap->size;i++) {
3822 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3823 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3824 nodemap->nodes[j].pnn));
3825 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3826 do_recovery(rec, mem_ctx, pnn, nodemap,
3833 /* we might need to change who has what IP assigned */
3834 if (rec->need_takeover_run) {
3835 uint32_t culprit = (uint32_t)-1;
3837 rec->need_takeover_run = false;
3839 /* update the list of public ips that a node can handle for
3842 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3844 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3846 rec->need_takeover_run = true;
3850 /* execute the "startrecovery" event script on all nodes */
3851 ret = run_startrecovery_eventscript(rec, nodemap);
3853 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3854 ctdb_set_culprit(rec, ctdb->pnn);
3855 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3859 /* If takeover run fails, then the offending nodes are
3860 * assigned ban culprit counts. And we re-try takeover.
3861 * If takeover run fails repeatedly, the node would get
3864 * If rec->need_takeover_run is not set to true at this
3865 * failure, monitoring is disabled cluster-wide (via
3866 * startrecovery eventscript) and will not get enabled.
3868 ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, rec);
3870 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Trying again\n"));
3874 /* execute the "recovered" event script on all nodes */
3875 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
3877 // we cant check whether the event completed successfully
3878 // since this script WILL fail if the node is in recovery mode
3879 // and if that race happens, the code here would just cause a second
3880 // cascading recovery.
3882 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3883 ctdb_set_culprit(rec, ctdb->pnn);
3884 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3891 the main monitoring loop
/*
 * Top level of the recovery daemon: allocate the private recoverd state,
 * register message handlers for every recovery-related SRVID, then run
 * main_loop() repeatedly, throttled so it executes at most once per
 * recover_interval seconds.
 *
 * NOTE(review): this listing is elided — source lines are missing between
 * the numbered statements below, so the exact loop construct and error
 * paths are not fully visible here.
 */
3893 static void monitor_cluster(struct ctdb_context *ctdb)
3895 struct ctdb_recoverd *rec;
3897 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* rec holds all private recovery-daemon state; allocation failure is fatal */
3899 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3900 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3904 rec->priority_time = timeval_current();
3906 /* register a message port for sending memory dumps */
3907 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3909 /* register a message port for requesting logs */
3910 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG, getlog_handler, rec);
3912 /* register a message port for clearing logs */
3913 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG, clearlog_handler, rec);
3915 /* register a message port for recovery elections */
3916 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3918 /* when nodes are disabled/enabled */
3919 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3921 /* when we are asked to push out a flag change */
3922 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3924 /* register a message port for vacuum fetch */
3925 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3927 /* register a message port for reloadnodes */
3928 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3930 /* register a message port for performing a takeover run */
3931 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3933 /* register a message port for performing a reload all ips */
3934 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_ALL_IPS, ip_reloadall_handler, rec);
3936 /* register a message port for disabling the ip check for a short while */
3937 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3939 /* register a message port for updating the recovery daemons node assignment for an ip */
3940 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3942 /* register a message port for forcing a rebalance of a node next
3944 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
/* per-iteration scratch context: everything main_loop() allocates on it
 * is released by the talloc_free() below */
3947 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3948 struct timeval start;
3952 DEBUG(DEBUG_CRIT,(__location__
3953 " Failed to create temp context\n"));
3957 start = timeval_current();
3958 main_loop(ctdb, rec, mem_ctx);
3959 talloc_free(mem_ctx);
/* throttle: sleep out the remainder of recover_interval so that
 * main_loop() runs at most once per interval */
3961 /* we only check for recovery once every second */
3962 elapsed = timeval_elapsed(&start);
3963 if (elapsed < ctdb->tunable.recover_interval) {
3964 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3971 event handler for when the main ctdbd dies
/*
 * Fd-event handler inside the recovery daemon: fires when the pipe shared
 * with the parent ctdbd becomes readable, i.e. the parent has exited and
 * its end of the pipe was closed.
 * NOTE(review): the exit call that presumably follows the DEBUG is elided
 * from this listing — confirm against the full source.
 */
3973 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3974 uint16_t flags, void *private_data)
3976 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3981 called regularly to verify that the recovery daemon is still running
/*
 * Watchdog running in the main ctdbd: probe the recovery daemon process
 * and schedule a restart if it has disappeared, then re-arm itself to
 * check again in 30 seconds.
 */
3983 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3984 struct timeval yt, void *p)
3986 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* signal 0 only probes for process existence, it delivers nothing */
3988 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3989 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
/* zero timeout: restart as soon as the event loop next runs */
3991 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
3992 ctdb_restart_recd, ctdb);
/* re-arm this check; hanging it off recd_ctx means freeing recd_ctx
 * cancels the watchdog */
3997 event_add_timed(ctdb->ev, ctdb->recd_ctx,
3998 timeval_current_ofs(30, 0),
3999 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap exited children with a
 * non-blocking waitpid() and log each reaped pid. ECHILD (no children
 * left to wait for) is deliberately not reported as an error.
 * NOTE(review): the loop construct around waitpid() is elided from this
 * listing — confirm the reap-until-empty structure in the full source.
 */
4002 static void recd_sig_child_handler(struct event_context *ev,
4003 struct signal_event *se, int signum, int count,
4007 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4012 pid = waitpid(-1, &status, WNOHANG);
4014 if (errno != ECHILD) {
4015 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4020 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4026 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon as a child of the main ctdb daemon.
 *
 * Parent path: remembers the child's pid, (re)creates recd_ctx and arms
 * the 30-second ctdb_check_recd watchdog, then returns.
 * Child path: reseeds the PRNG, clears the log ringbuffer, switches from
 * server to client mode, watches the parent-death pipe, installs a
 * SIGCHLD handler and enters monitor_cluster() — which should not return.
 *
 * NOTE(review): this listing is elided; the early-return/error lines
 * between the visible statements are missing.
 */
4028 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4031 struct signal_event *se;
4032 struct tevent_fd *fde;
/* the pipe lets the child detect parent death: the child keeps the read
 * end and gets a read event when the parent exits and the write end
 * closes */
4034 if (pipe(fd) != 0) {
4038 ctdb->ctdbd_pid = getpid();
4040 ctdb->recoverd_pid = ctdb_fork_no_free_ringbuffer(ctdb);
4041 if (ctdb->recoverd_pid == -1) {
/* parent: set up the watchdog that restarts recoverd if it dies */
4045 if (ctdb->recoverd_pid != 0) {
4046 talloc_free(ctdb->recd_ctx);
4047 ctdb->recd_ctx = talloc_new(ctdb);
4048 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4051 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4052 timeval_current_ofs(30, 0),
4053 ctdb_check_recd, ctdb);
/* child: becomes the recovery daemon proper from here on */
4059 srandom(getpid() ^ time(NULL));
4061 /* Clear the log ringbuffer */
4062 ctdb_clear_log(ctdb);
4064 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4065 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4069 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* watch the read end of the parent-death pipe; auto-close ties the fd's
 * lifetime to the event so it is closed when the event is freed */
4071 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4072 ctdb_recoverd_parent, &fd[0]);
4073 tevent_fd_set_auto_close(fde);
4075 /* set up a handler to pick up sigchld */
4076 se = event_add_signal(ctdb->ev, ctdb,
4078 recd_sig_child_handler,
4081 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* does not return while the daemon is healthy */
4085 monitor_cluster(ctdb);
4087 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4092 shutdown the recovery daemon
/*
 * Shut down the recovery daemon: a no-op if it was never started
 * (recoverd_pid == 0); otherwise SIGTERM the child and free the
 * watchdog/ping state.
 */
4094 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4096 if (ctdb->recoverd_pid == 0) {
4100 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4101 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
/* freeing recd_ctx also cancels the pending ctdb_check_recd timer
 * hung off it; TALLOC_FREE NULLs the pointers after freeing */
4103 TALLOC_FREE(ctdb->recd_ctx);
4104 TALLOC_FREE(ctdb->recd_ping_count);
4107 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4108 struct timeval t, void *private_data)
4110 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4112 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4113 ctdb_stop_recoverd(ctdb);
4114 ctdb_start_recoverd(ctdb);