4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
39 struct rd_memdump_reply *rd;
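/* each queued caller carries its reply address (pnn and srvid) so that
   process_ipreallocate_requests() can send the takeover result back to it */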
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
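/* while ip_check_disable_ctx is non-NULL the public ip address checks are
   suppressed; reenable_ip_check() frees it again (and resets it to NULL)
   when the timeout requested via disable_ip_check_handler() expires */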
69 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
70 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
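/* the two macros above turn the recover_timeout and recover_interval tunables
   into absolute deadlines, measured from the current time, for controls and
   for the monitoring loop respectively */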
74 ban a node for a period of time
76 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
79 struct ctdb_context *ctdb = rec->ctdb;
80 struct ctdb_ban_time bantime;
82 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
90 bantime.time = ban_time;
92 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
94 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
100 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
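/* the monitor_result codes above are returned by the verification helpers
   further down (verify_recmode, verify_recmaster): either everything is fine,
   a recovery is needed, a new recmaster election is needed, or the check
   itself failed and should simply be retried on the next iteration */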
104 run the "recovered" eventscript on all nodes
106 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
111 tmp_ctx = talloc_new(ctdb);
112 CTDB_NO_MEMORY(ctdb, tmp_ctx);
114 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
115 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
117 CONTROL_TIMEOUT(), false, tdb_null,
120 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
122 talloc_free(tmp_ctx);
126 talloc_free(tmp_ctx);
131 remember the trouble maker
133 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
135 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
136 struct ctdb_banning_state *ban_state;
138 if (culprit > ctdb->num_nodes) {
139 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
143 if (ctdb->nodes[culprit]->ban_state == NULL) {
144 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
145 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
149 ban_state = ctdb->nodes[culprit]->ban_state;
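/* each offence adds 'count' credits to the node's ban state; do_recovery()
   bans a node once it has accumulated 2 * num_nodes credits, and the counter
   is reset below if the node has behaved for longer than the recovery grace
   period */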
150 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
151 /* this was the first time in a long while this node
152 misbehaved so we will forgive any old transgressions.
154 ban_state->count = 0;
157 ban_state->count += count;
158 ban_state->last_reported_time = timeval_current();
159 rec->last_culprit_node = culprit;
163 remember the trouble maker
165 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
167 ctdb_set_culprit_count(rec, culprit, 1);
/* this callback is called for every node that failed to execute the startrecovery event */
174 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
176 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
178 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
180 ctdb_set_culprit(rec, node_pnn);
184 run the "startrecovery" eventscript on all nodes
186 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
190 struct ctdb_context *ctdb = rec->ctdb;
192 tmp_ctx = talloc_new(ctdb);
193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
195 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
198 CONTROL_TIMEOUT(), false, tdb_null,
200 startrecovery_fail_callback,
202 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
203 talloc_free(tmp_ctx);
207 talloc_free(tmp_ctx);
211 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
213 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
217 if (node_pnn < ctdb->num_nodes) {
218 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
223 update the node capabilities for all connected nodes
225 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
230 tmp_ctx = talloc_new(ctdb);
231 CTDB_NO_MEMORY(ctdb, tmp_ctx);
233 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
234 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
238 async_getcap_callback, NULL,
240 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
241 talloc_free(tmp_ctx);
245 talloc_free(tmp_ctx);
249 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
251 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
253 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
254 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
258 change recovery mode on all nodes
260 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
266 tmp_ctx = talloc_new(ctdb);
267 CTDB_NO_MEMORY(ctdb, tmp_ctx);
269 /* freeze all nodes */
270 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
271 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
274 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
275 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
280 set_recmode_fail_callback,
282 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
283 talloc_free(tmp_ctx);
290 data.dsize = sizeof(uint32_t);
291 data.dptr = (unsigned char *)&rec_mode;
293 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
299 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
300 talloc_free(tmp_ctx);
304 talloc_free(tmp_ctx);
change recovery master on all nodes
311 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
317 tmp_ctx = talloc_new(ctdb);
318 CTDB_NO_MEMORY(ctdb, tmp_ctx);
320 data.dsize = sizeof(uint32_t);
321 data.dptr = (unsigned char *)&pnn;
323 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
324 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
326 CONTROL_TIMEOUT(), false, data,
329 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
330 talloc_free(tmp_ctx);
334 talloc_free(tmp_ctx);
/* update all remote nodes to use the same db priority that we have.
this can fail if the remote node has not yet been upgraded to
support this function, so we always return success and never fail
a recovery if this call fails.
343 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
344 struct ctdb_node_map *nodemap,
345 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
350 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
352 /* step through all local databases */
353 for (db=0; db<dbmap->num;db++) {
355 struct ctdb_db_priority db_prio;
358 db_prio.db_id = dbmap->dbs[db].dbid;
359 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
361 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
365 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
367 data.dptr = (uint8_t *)&db_prio;
368 data.dsize = sizeof(db_prio);
370 if (ctdb_client_async_control(ctdb,
371 CTDB_CONTROL_SET_DB_PRIORITY,
373 CONTROL_TIMEOUT(), false, data,
376 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
384 ensure all other nodes have attached to any databases that we have
386 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
387 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
390 struct ctdb_dbid_map *remote_dbmap;
392 /* verify that all other nodes have all our databases */
393 for (j=0; j<nodemap->num; j++) {
/* we don't need to check ourselves */
395 if (nodemap->nodes[j].pnn == pnn) {
398 /* dont check nodes that are unavailable */
399 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
403 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
404 mem_ctx, &remote_dbmap);
406 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
410 /* step through all local databases */
411 for (db=0; db<dbmap->num;db++) {
415 for (i=0;i<remote_dbmap->num;i++) {
416 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
/* the remote node already has this database */
421 if (i!=remote_dbmap->num) {
424 /* ok so we need to create this database */
425 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
428 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
431 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
432 mem_ctx, name, dbmap->dbs[db].persistent);
434 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
445 ensure we are attached to any databases that anyone else is attached to
447 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
448 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
451 struct ctdb_dbid_map *remote_dbmap;
/* verify that we have all databases any other node has */
454 for (j=0; j<nodemap->num; j++) {
/* we don't need to check ourselves */
456 if (nodemap->nodes[j].pnn == pnn) {
459 /* dont check nodes that are unavailable */
460 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
464 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
465 mem_ctx, &remote_dbmap);
467 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
471 /* step through all databases on the remote node */
472 for (db=0; db<remote_dbmap->num;db++) {
475 for (i=0;i<(*dbmap)->num;i++) {
476 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
480 /* we already have this db locally */
481 if (i!=(*dbmap)->num) {
/* ok so we need to create this database locally and then reread our local dbmap */
487 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
488 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
490 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
491 nodemap->nodes[j].pnn));
494 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
495 remote_dbmap->dbs[db].persistent);
497 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
500 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
502 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
513 pull the remote database contents from one node into the recdb
515 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
516 struct tdb_wrap *recdb, uint32_t dbid)
520 struct ctdb_marshall_buffer *reply;
521 struct ctdb_rec_data *rec;
523 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
525 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
526 CONTROL_TIMEOUT(), &outdata);
528 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
529 talloc_free(tmp_ctx);
533 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
535 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
536 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
537 talloc_free(tmp_ctx);
541 rec = (struct ctdb_rec_data *)&reply->data[0];
545 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
547 struct ctdb_ltdb_header *hdr;
550 key.dptr = &rec->data[0];
551 key.dsize = rec->keylen;
552 data.dptr = &rec->data[key.dsize];
553 data.dsize = rec->datalen;
555 hdr = (struct ctdb_ltdb_header *)data.dptr;
557 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
558 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
559 talloc_free(tmp_ctx);
563 /* fetch the existing record, if any */
564 existing = tdb_fetch(recdb->tdb, key);
566 if (existing.dptr != NULL) {
567 struct ctdb_ltdb_header header;
568 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
569 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
570 (unsigned)existing.dsize, srcnode));
572 talloc_free(tmp_ctx);
575 header = *(struct ctdb_ltdb_header *)existing.dptr;
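/* merge rule: the remote copy replaces what we already have only if it has a
   higher rsn, or the same rsn while the locally stored dmaster is not the
   recovery master */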
577 if (!(header.rsn < hdr->rsn ||
578 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
583 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
584 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
585 talloc_free(tmp_ctx);
590 talloc_free(tmp_ctx);
596 pull all the remote database contents into the recdb
598 static int pull_remote_database(struct ctdb_context *ctdb,
599 struct ctdb_recoverd *rec,
600 struct ctdb_node_map *nodemap,
601 struct tdb_wrap *recdb, uint32_t dbid)
605 /* pull all records from all other nodes across onto this node
606 (this merges based on rsn)
608 for (j=0; j<nodemap->num; j++) {
609 /* dont merge from nodes that are unavailable */
610 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
613 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
614 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
615 nodemap->nodes[j].pnn));
616 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
626 update flags on all active nodes
628 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
632 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
634 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
642 ensure all nodes have the same vnnmap we do
644 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
645 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
649 /* push the new vnn map out to all the nodes */
650 for (j=0; j<nodemap->num; j++) {
651 /* dont push to nodes that are unavailable */
652 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
656 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
658 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
668 struct vacuum_info *next, *prev;
669 struct ctdb_recoverd *rec;
671 struct ctdb_db_context *ctdb_db;
672 struct ctdb_marshall_buffer *recs;
673 struct ctdb_rec_data *r;
676 static void vacuum_fetch_next(struct vacuum_info *v);
679 called when a vacuum fetch has completed - just free it and do the next one
681 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
683 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
685 vacuum_fetch_next(v);
690 process the next element from the vacuum list
692 static void vacuum_fetch_next(struct vacuum_info *v)
694 struct ctdb_call call;
695 struct ctdb_rec_data *r;
697 while (v->recs->count) {
698 struct ctdb_client_call_state *state;
700 struct ctdb_ltdb_header *hdr;
703 call.call_id = CTDB_NULL_FUNC;
704 call.flags = CTDB_IMMEDIATE_MIGRATION;
707 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
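/* records in the marshall buffer are packed back to back, so v->r now points
   at the next record; we advance before issuing the migration call so the
   callback can simply continue from there */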
710 call.key.dptr = &r->data[0];
711 call.key.dsize = r->keylen;
/* ensure we don't block this daemon - just skip a record if we can't get the chainlock */
715 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
719 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
720 if (data.dptr == NULL) {
721 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
725 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
727 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
731 hdr = (struct ctdb_ltdb_header *)data.dptr;
732 if (hdr->dmaster == v->rec->ctdb->pnn) {
733 /* its already local */
735 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
741 state = ctdb_call_send(v->ctdb_db, &call);
742 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
744 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
748 state->async.fn = vacuum_fetch_callback;
749 state->async.private_data = v;
758 destroy a vacuum info structure
760 static int vacuum_info_destructor(struct vacuum_info *v)
762 DLIST_REMOVE(v->rec->vacuum_info, v);
768 handler for vacuum fetch
770 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
771 TDB_DATA data, void *private_data)
773 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
774 struct ctdb_marshall_buffer *recs;
776 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
778 struct ctdb_dbid_map *dbmap=NULL;
779 bool persistent = false;
780 struct ctdb_db_context *ctdb_db;
781 struct ctdb_rec_data *r;
783 struct vacuum_info *v;
785 recs = (struct ctdb_marshall_buffer *)data.dptr;
786 r = (struct ctdb_rec_data *)&recs->data[0];
788 if (recs->count == 0) {
789 talloc_free(tmp_ctx);
795 for (v=rec->vacuum_info;v;v=v->next) {
796 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
797 /* we're already working on records from this node */
798 talloc_free(tmp_ctx);
803 /* work out if the database is persistent */
804 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
806 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
807 talloc_free(tmp_ctx);
811 for (i=0;i<dbmap->num;i++) {
812 if (dbmap->dbs[i].dbid == recs->db_id) {
813 persistent = dbmap->dbs[i].persistent;
817 if (i == dbmap->num) {
818 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
819 talloc_free(tmp_ctx);
823 /* find the name of this database */
824 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
825 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
826 talloc_free(tmp_ctx);
831 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
832 if (ctdb_db == NULL) {
833 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
834 talloc_free(tmp_ctx);
838 v = talloc_zero(rec, struct vacuum_info);
840 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
841 talloc_free(tmp_ctx);
846 v->srcnode = srcnode;
847 v->ctdb_db = ctdb_db;
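/* keep a private copy of the marshalled records: the fetch calls complete
   asynchronously, long after the message buffer handed to this handler is
   gone */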
848 v->recs = talloc_memdup(v, recs, data.dsize);
849 if (v->recs == NULL) {
850 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
852 talloc_free(tmp_ctx);
855 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
857 DLIST_ADD(rec->vacuum_info, v);
859 talloc_set_destructor(v, vacuum_info_destructor);
861 vacuum_fetch_next(v);
862 talloc_free(tmp_ctx);
867 called when ctdb_wait_timeout should finish
869 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
870 struct timeval yt, void *p)
872 uint32_t *timed_out = (uint32_t *)p;
877 wait for a given number of seconds
879 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
881 uint32_t timed_out = 0;
882 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
884 event_loop_once(ctdb->ev);
889 called when an election times out (ends)
891 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
892 struct timeval t, void *p)
894 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
895 rec->election_timeout = NULL;
897 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
wait for an election to finish. It finishes election_timeout seconds after
the last election packet is received
905 static void ctdb_wait_election(struct ctdb_recoverd *rec)
907 struct ctdb_context *ctdb = rec->ctdb;
908 while (rec->election_timeout) {
909 event_loop_once(ctdb->ev);
914 Update our local flags from all remote connected nodes.
This is only run when we are, or we believe we are, the recovery master
917 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
920 struct ctdb_context *ctdb = rec->ctdb;
921 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
923 /* get the nodemap for all active remote nodes and verify
924 they are the same as for this node
926 for (j=0; j<nodemap->num; j++) {
927 struct ctdb_node_map *remote_nodemap=NULL;
930 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
933 if (nodemap->nodes[j].pnn == ctdb->pnn) {
937 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
938 mem_ctx, &remote_nodemap);
940 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
941 nodemap->nodes[j].pnn));
942 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
943 talloc_free(mem_ctx);
944 return MONITOR_FAILED;
946 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
947 /* We should tell our daemon about this so it
948 updates its flags or else we will log the same
949 message again in the next iteration of recovery.
950 Since we are the recovery master we can just as
951 well update the flags on all nodes.
953 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
955 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Update our local copy of the flags in the recovery daemon */
962 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
963 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
964 nodemap->nodes[j].flags));
965 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
967 talloc_free(remote_nodemap);
969 talloc_free(mem_ctx);
/* Create a new random generation id.
The generation id cannot be the INVALID_GENERATION id
977 static uint32_t new_generation(void)
982 generation = random();
984 if (generation != INVALID_GENERATION) {
994 create a temporary working database
996 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
999 struct tdb_wrap *recdb;
1002 /* open up the temporary recovery database */
1003 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
1009 tdb_flags = TDB_NOLOCK;
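/* the recdb is private to this recovery run, so tdb locking is unnecessary;
   when do_setsched is off we also avoid mmap - presumably to make access
   friendlier to debugging tools such as valgrind (an assumption, the original
   rationale is not stated here) */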
1010 if (!ctdb->do_setsched) {
1011 tdb_flags |= TDB_NOMMAP;
1014 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1015 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1016 if (recdb == NULL) {
1017 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
a traverse function for pulling all relevant records from recdb
1030 struct ctdb_context *ctdb;
1031 struct ctdb_marshall_buffer *recdata;
1036 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1038 struct recdb_data *params = (struct recdb_data *)p;
1039 struct ctdb_rec_data *rec;
1040 struct ctdb_ltdb_header *hdr;
1042 /* skip empty records */
1043 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1047 /* update the dmaster field to point to us */
1048 hdr = (struct ctdb_ltdb_header *)data.dptr;
1049 hdr->dmaster = params->ctdb->pnn;
1051 /* add the record to the blob ready to send to the nodes */
1052 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1054 params->failed = true;
1057 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1058 if (params->recdata == NULL) {
1059 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1060 rec->length + params->len, params->recdata->count));
1061 params->failed = true;
1064 params->recdata->count++;
1065 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1066 params->len += rec->length;
1073 push the recdb database out to all nodes
1075 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1076 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1078 struct recdb_data params;
1079 struct ctdb_marshall_buffer *recdata;
1081 TALLOC_CTX *tmp_ctx;
1084 tmp_ctx = talloc_new(ctdb);
1085 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1087 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1088 CTDB_NO_MEMORY(ctdb, recdata);
1090 recdata->db_id = dbid;
1093 params.recdata = recdata;
1094 params.len = offsetof(struct ctdb_marshall_buffer, data);
1095 params.failed = false;
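/* walk every record in the recdb; traverse_recdb appends each record, with
   its dmaster rewritten to point at us, after the marshall buffer header
   accounted for in params.len above */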
if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1098 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1099 talloc_free(params.recdata);
1100 talloc_free(tmp_ctx);
1104 if (params.failed) {
1105 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1106 talloc_free(params.recdata);
1107 talloc_free(tmp_ctx);
1111 recdata = params.recdata;
1113 outdata.dptr = (void *)recdata;
1114 outdata.dsize = params.len;
1116 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1117 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1119 CONTROL_TIMEOUT(), false, outdata,
1122 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1123 talloc_free(recdata);
1124 talloc_free(tmp_ctx);
1128 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1129 dbid, recdata->count));
1131 talloc_free(recdata);
1132 talloc_free(tmp_ctx);
1139 go through a full recovery on one database
1141 static int recover_database(struct ctdb_recoverd *rec,
1142 TALLOC_CTX *mem_ctx,
1145 struct ctdb_node_map *nodemap,
1146 uint32_t transaction_id)
1148 struct tdb_wrap *recdb;
1150 struct ctdb_context *ctdb = rec->ctdb;
1152 struct ctdb_control_wipe_database w;
1155 recdb = create_recdb(ctdb, mem_ctx);
1156 if (recdb == NULL) {
1160 /* pull all remote databases onto the recdb */
1161 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
1163 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1167 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1169 /* wipe all the remote databases. This is safe as we are in a transaction */
1171 w.transaction_id = transaction_id;
1173 data.dptr = (void *)&w;
1174 data.dsize = sizeof(w);
1176 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1177 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1179 CONTROL_TIMEOUT(), false, data,
1182 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1187 /* push out the correct database. This sets the dmaster and skips
1188 the empty records */
1189 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1195 /* all done with this database */
1202 reload the nodes file
1204 static void reload_nodes_file(struct ctdb_context *ctdb)
1207 ctdb_load_nodes_file(ctdb);
1212 we are the recmaster, and recovery is needed - start a recovery run
1214 static int do_recovery(struct ctdb_recoverd *rec,
1215 TALLOC_CTX *mem_ctx, uint32_t pnn,
1216 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1218 struct ctdb_context *ctdb = rec->ctdb;
1220 uint32_t generation;
1221 struct ctdb_dbid_map *dbmap;
1224 struct timeval start_time;
1226 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1228 /* if recovery fails, force it again */
1229 rec->need_recovery = true;
1231 for (i=0; i<ctdb->num_nodes; i++) {
1232 struct ctdb_banning_state *ban_state;
1234 if (ctdb->nodes[i]->ban_state == NULL) {
1237 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1238 if (ban_state->count < 2*ctdb->num_nodes) {
1241 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1242 ctdb->nodes[i]->pnn, ban_state->count,
1243 ctdb->tunable.recovery_ban_period));
1244 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1245 ban_state->count = 0;
1249 if (ctdb->tunable.verify_recovery_lock != 0) {
1250 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1251 start_time = timeval_current();
1252 if (!ctdb_recovery_lock(ctdb, true)) {
1253 ctdb_set_culprit(rec, pnn);
1254 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1257 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1258 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1261 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1263 /* get a list of all databases */
1264 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1266 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1270 /* we do the db creation before we set the recovery mode, so the freeze happens
1271 on all databases we will be dealing with. */
1273 /* verify that we have all the databases any other node has */
1274 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1276 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1280 /* verify that all other nodes have all our databases */
1281 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1283 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1286 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1288 /* update the database priority for all remote databases */
1289 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1291 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1293 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1296 /* set recovery mode to active on all nodes */
1297 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1299 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1303 /* execute the "startrecovery" event script on all nodes */
1304 ret = run_startrecovery_eventscript(rec, nodemap);
1306 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1310 /* pick a new generation number */
1311 generation = new_generation();
1313 /* change the vnnmap on this node to use the new generation
1314 number but not on any other nodes.
1315 this guarantees that if we abort the recovery prematurely
1316 for some reason (a node stops responding?)
1317 that we can just return immediately and we will reenter
1318 recovery shortly again.
1319 I.e. we deliberately leave the cluster with an inconsistent
1320 generation id to allow us to abort recovery at any stage and
1321 just restart it from scratch.
1323 vnnmap->generation = generation;
1324 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1326 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1330 data.dptr = (void *)&generation;
1331 data.dsize = sizeof(uint32_t);
1333 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1334 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1336 CONTROL_TIMEOUT(), false, data,
1339 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1343 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1345 for (i=0;i<dbmap->num;i++) {
1346 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1347 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1352 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1354 /* commit all the changes */
1355 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1357 CONTROL_TIMEOUT(), false, data,
1360 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1364 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1367 /* update the capabilities for all nodes */
1368 ret = update_capabilities(ctdb, nodemap);
1370 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
/* build a new vnn map with all the currently active and lmaster-capable nodes */
1376 generation = new_generation();
1377 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1378 CTDB_NO_MEMORY(ctdb, vnnmap);
1379 vnnmap->generation = generation;
1381 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1382 CTDB_NO_MEMORY(ctdb, vnnmap->map);
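/* only nodes that are active and advertise the LMASTER capability are added
   to the new vnnmap; the map is grown one slot at a time as nodes qualify */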
1383 for (i=j=0;i<nodemap->num;i++) {
1384 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1387 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1388 /* this node can not be an lmaster */
1389 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1394 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1395 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1396 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1399 if (vnnmap->size == 0) {
1400 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1402 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1403 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1404 vnnmap->map[0] = pnn;
1407 /* update to the new vnnmap on all nodes */
1408 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1410 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1414 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1416 /* update recmaster to point to us for all nodes */
1417 ret = set_recovery_master(ctdb, nodemap, pnn);
1419 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1423 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1426 update all nodes to have the same flags that we have
1428 for (i=0;i<nodemap->num;i++) {
1429 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1433 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1435 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1440 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1442 /* disable recovery mode */
1443 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1445 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1449 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1452 tell nodes to takeover their public IPs
1454 rec->need_takeover_run = false;
1455 ret = ctdb_takeover_run(ctdb, nodemap);
1457 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1460 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1462 /* execute the "recovered" event script on all nodes */
1463 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1465 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1469 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1471 /* send a message to all clients telling them that the cluster
1472 has been reconfigured */
1473 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1475 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1477 rec->need_recovery = false;
/* we managed to complete a full recovery, make sure to forgive
any past sins by the nodes that could now participate in the recovery.
1483 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1484 for (i=0;i<nodemap->num;i++) {
1485 struct ctdb_banning_state *ban_state;
1487 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1491 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1492 if (ban_state == NULL) {
1496 ban_state->count = 0;
1500 /* We just finished a recovery successfully.
1501 We now wait for rerecovery_timeout before we allow
1502 another recovery to take place.
1504 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1505 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1506 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
1513 elections are won by first checking the number of connected nodes, then
1514 the priority time, then the pnn
1516 struct election_message {
1517 uint32_t num_connected;
1518 struct timeval priority_time;
1520 uint32_t node_flags;
form this node's election data
1526 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1529 struct ctdb_node_map *nodemap;
1530 struct ctdb_context *ctdb = rec->ctdb;
1534 em->pnn = rec->ctdb->pnn;
1535 em->priority_time = rec->priority_time;
1537 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1539 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1543 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1544 em->node_flags = rec->node_flags;
1546 for (i=0;i<nodemap->num;i++) {
1547 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1548 em->num_connected++;
/* we shouldn't try to win this election if we can't be a recmaster */
1553 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1554 em->num_connected = 0;
1555 em->priority_time = timeval_current();
1558 talloc_free(nodemap);
1562 see if the given election data wins
1564 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1566 struct election_message myem;
1569 ctdb_election_data(rec, &myem);
/* we can't win if we don't have the recmaster capability */
if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* we can't win if we are banned */
if (rec->node_flags & NODE_FLAGS_BANNED) {
/* we can't win if we are stopped */
if (rec->node_flags & NODE_FLAGS_STOPPED) {
/* we will automatically win if the other node is banned */
if (em->node_flags & NODE_FLAGS_BANNED) {
/* we will automatically win if the other node is stopped */
if (em->node_flags & NODE_FLAGS_STOPPED) {
1596 /* try to use the most connected node */
1598 cmp = (int)myem.num_connected - (int)em->num_connected;
1601 /* then the longest running node */
1603 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1607 cmp = (int)myem.pnn - (int)em->pnn;
1614 send out an election request
1616 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1619 TDB_DATA election_data;
1620 struct election_message emsg;
1622 struct ctdb_context *ctdb = rec->ctdb;
1624 srvid = CTDB_SRVID_RECOVERY;
1626 ctdb_election_data(rec, &emsg);
1628 election_data.dsize = sizeof(struct election_message);
1629 election_data.dptr = (unsigned char *)&emsg;
1632 /* send an election message to all active nodes */
1633 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1634 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
/* A new node that is already frozen has entered the cluster.
The existing nodes are not frozen and don't need to be frozen
until the election has ended and we start the actual recovery
if (update_recmaster == true) {
/* first we assume we will win the election and set
the recovery master to be ourselves on the current node
1645 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1647 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1657 this function will unban all nodes in the cluster
1659 static void unban_all_nodes(struct ctdb_context *ctdb)
1662 struct ctdb_node_map *nodemap;
1663 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1665 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1667 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1671 for (i=0;i<nodemap->num;i++) {
1672 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1673 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1674 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1678 talloc_free(tmp_ctx);
1683 we think we are winning the election - send a broadcast election request
1685 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1687 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1690 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1692 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1695 talloc_free(rec->send_election_te);
1696 rec->send_election_te = NULL;
1700 handler for memory dumps
1702 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1703 TDB_DATA data, void *private_data)
1705 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1708 struct rd_memdump_reply *rd;
1710 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1711 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1712 talloc_free(tmp_ctx);
1715 rd = (struct rd_memdump_reply *)data.dptr;
1717 dump = talloc_zero(tmp_ctx, TDB_DATA);
1719 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1720 talloc_free(tmp_ctx);
1723 ret = ctdb_dump_memory(ctdb, dump);
1725 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1726 talloc_free(tmp_ctx);
1730 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1732 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1734 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1735 talloc_free(tmp_ctx);
1739 talloc_free(tmp_ctx);
1743 handler for reload_nodes
1745 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1746 TDB_DATA data, void *private_data)
1748 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1750 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1752 reload_nodes_file(rec->ctdb);
1756 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1757 struct timeval yt, void *p)
1759 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1761 talloc_free(rec->ip_check_disable_ctx);
1762 rec->ip_check_disable_ctx = NULL;
1765 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1766 TDB_DATA data, void *private_data)
1768 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1771 if (rec->ip_check_disable_ctx != NULL) {
1772 talloc_free(rec->ip_check_disable_ctx);
1773 rec->ip_check_disable_ctx = NULL;
1776 if (data.dsize != sizeof(uint32_t)) {
1777 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu expexting %lu\n", data.dsize, sizeof(uint32_t)));
1780 if (data.dptr == NULL) {
1781 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1785 timeout = *((uint32_t *)data.dptr);
1786 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1788 rec->ip_check_disable_ctx = talloc_new(rec);
1789 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
1791 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1796 handler for ip reallocate, just add it to the list of callers and
1797 handle this later in the monitor_cluster loop so we do not recurse
1798 with other callers to takeover_run()
1800 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1801 TDB_DATA data, void *private_data)
1803 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1804 struct ip_reallocate_list *caller;
1806 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1807 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1811 if (rec->ip_reallocate_ctx == NULL) {
1812 rec->ip_reallocate_ctx = talloc_new(rec);
1813 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1816 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1817 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1819 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1820 caller->next = rec->reallocate_callers;
1821 rec->reallocate_callers = caller;
1826 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1828 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1831 struct ip_reallocate_list *callers;
1833 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1834 ret = ctdb_takeover_run(ctdb, rec->nodemap);
1835 result.dsize = sizeof(int32_t);
1836 result.dptr = (uint8_t *)&ret;
1838 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1839 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1840 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1842 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1846 talloc_free(tmp_ctx);
1847 talloc_free(rec->ip_reallocate_ctx);
1848 rec->ip_reallocate_ctx = NULL;
1849 rec->reallocate_callers = NULL;
1855 handler for recovery master elections
1857 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1858 TDB_DATA data, void *private_data)
1860 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1862 struct election_message *em = (struct election_message *)data.dptr;
1863 TALLOC_CTX *mem_ctx;
1865 /* we got an election packet - update the timeout for the election */
1866 talloc_free(rec->election_timeout);
1867 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1868 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1869 ctdb_election_timeout, rec);
1871 mem_ctx = talloc_new(ctdb);
1873 /* someone called an election. check their election data
1874 and if we disagree and we would rather be the elected node,
1875 send a new election message to all other nodes
1877 if (ctdb_election_win(rec, em)) {
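/* we believe we would win this election: instead of replying straight away,
   make sure (at most) one delayed broadcast of our own election request is
   scheduled, which will assert our claim to be recmaster */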
1878 if (!rec->send_election_te) {
1879 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1880 timeval_current_ofs(0, 500000),
1881 election_send_request, rec);
1883 talloc_free(mem_ctx);
1884 /*unban_all_nodes(ctdb);*/
1889 talloc_free(rec->send_election_te);
1890 rec->send_election_te = NULL;
1892 if (ctdb->tunable.verify_recovery_lock != 0) {
1893 /* release the recmaster lock */
1894 if (em->pnn != ctdb->pnn &&
1895 ctdb->recovery_lock_fd != -1) {
1896 close(ctdb->recovery_lock_fd);
1897 ctdb->recovery_lock_fd = -1;
1898 unban_all_nodes(ctdb);
1902 /* ok, let that guy become recmaster then */
1903 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1905 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1906 talloc_free(mem_ctx);
1910 talloc_free(mem_ctx);
1916 force the start of the election process
1918 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1919 struct ctdb_node_map *nodemap)
1922 struct ctdb_context *ctdb = rec->ctdb;
1924 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1926 /* set all nodes to recovery mode to stop all internode traffic */
1927 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1929 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1933 talloc_free(rec->election_timeout);
1934 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1935 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1936 ctdb_election_timeout, rec);
1938 ret = send_election_request(rec, pnn, true);
1940 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1944 /* wait for a few seconds to collect all responses */
1945 ctdb_wait_election(rec);
1951 handler for when a node changes its flags
1953 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1954 TDB_DATA data, void *private_data)
1957 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1958 struct ctdb_node_map *nodemap=NULL;
1959 TALLOC_CTX *tmp_ctx;
1960 uint32_t changed_flags;
1962 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1963 int disabled_flag_changed;
1965 if (data.dsize != sizeof(*c)) {
1966 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1970 tmp_ctx = talloc_new(ctdb);
1971 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1973 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1975 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1976 talloc_free(tmp_ctx);
1981 for (i=0;i<nodemap->num;i++) {
1982 if (nodemap->nodes[i].pnn == c->pnn) break;
1985 if (i == nodemap->num) {
1986 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1987 talloc_free(tmp_ctx);
1991 changed_flags = c->old_flags ^ c->new_flags;
1993 if (nodemap->nodes[i].flags != c->new_flags) {
1994 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1997 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
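/* remember whether the DISABLED bit differs from our cached flags before we
   overwrite them below; this is what decides further down whether a takeover
   run is needed */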
1999 nodemap->nodes[i].flags = c->new_flags;
2001 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2002 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2005 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2006 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2010 ctdb->recovery_master == ctdb->pnn &&
2011 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
/* Only do the takeover run if the perm disabled or unhealthy
flags changed, since these will cause an ip failover but not
a recovery.
If the node became disconnected or banned this will also
lead to an ip address failover but that is handled elsewhere.
if (disabled_flag_changed) {
2020 rec->need_takeover_run = true;
2024 talloc_free(tmp_ctx);
handler for when we need to push out flag changes to all other nodes
2030 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2031 TDB_DATA data, void *private_data)
2034 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2036 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
2038 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
2043 struct verify_recmode_normal_data {
2045 enum monitor_result status;
2048 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2050 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2053 /* one more node has responded with recmode data*/
2056 /* if we failed to get the recmode, then return an error and let
2057 the main loop try again.
2059 if (state->state != CTDB_CONTROL_DONE) {
2060 if (rmdata->status == MONITOR_OK) {
2061 rmdata->status = MONITOR_FAILED;
/* if we got a response, then the recmode will be stored in the status field */
2069 if (state->status != CTDB_RECOVERY_NORMAL) {
2070 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2071 rmdata->status = MONITOR_RECOVERY_NEEDED;
2078 /* verify that all nodes are in normal recovery mode */
2079 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2081 struct verify_recmode_normal_data *rmdata;
2082 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2083 struct ctdb_client_control_state *state;
2084 enum monitor_result status;
2087 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2088 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2090 rmdata->status = MONITOR_OK;
/* loop over all active nodes and send an async getrecmode call to each of them */
2094 for (j=0; j<nodemap->num; j++) {
2095 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2098 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2100 nodemap->nodes[j].pnn);
2101 if (state == NULL) {
2102 /* we failed to send the control, treat this as
2103 an error and try again next iteration
2105 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2106 talloc_free(mem_ctx);
2107 return MONITOR_FAILED;
2110 /* set up the callback functions */
2111 state->async.fn = verify_recmode_normal_callback;
2112 state->async.private_data = rmdata;
2114 /* one more control to wait for to complete */
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from have replied
2122 while (rmdata->count > 0) {
2123 event_loop_once(ctdb->ev);
2126 status = rmdata->status;
2127 talloc_free(mem_ctx);
2132 struct verify_recmaster_data {
2133 struct ctdb_recoverd *rec;
2136 enum monitor_result status;
2139 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2141 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2144 /* one more node has responded with recmaster data*/
2147 /* if we failed to get the recmaster, then return an error and let
2148 the main loop try again.
2150 if (state->state != CTDB_CONTROL_DONE) {
2151 if (rmdata->status == MONITOR_OK) {
2152 rmdata->status = MONITOR_FAILED;
/* if we got a response, then the recmaster will be stored in the status field */
2160 if (state->status != rmdata->pnn) {
2161 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2162 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2163 rmdata->status = MONITOR_ELECTION_NEEDED;
2170 /* verify that all nodes agree that we are the recmaster */
2171 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2173 struct ctdb_context *ctdb = rec->ctdb;
2174 struct verify_recmaster_data *rmdata;
2175 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2176 struct ctdb_client_control_state *state;
2177 enum monitor_result status;
2180 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2181 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2185 rmdata->status = MONITOR_OK;
/* loop over all active nodes and send an async getrecmaster call to each of them */
2189 for (j=0; j<nodemap->num; j++) {
2190 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2193 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2195 nodemap->nodes[j].pnn);
2196 if (state == NULL) {
2197 /* we failed to send the control, treat this as
2198 an error and try again next iteration
2200 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2201 talloc_free(mem_ctx);
2202 return MONITOR_FAILED;
2205 /* set up the callback functions */
2206 state->async.fn = verify_recmaster_callback;
2207 state->async.private_data = rmdata;
2209 /* one more control to wait for to complete */
/* now wait for up to the maximum number of seconds allowed
or until all nodes we expect a response from have replied
2217 while (rmdata->count > 0) {
2218 event_loop_once(ctdb->ev);
2221 status = rmdata->status;
2222 talloc_free(mem_ctx);
2227 /* called to check that the allocation of public ip addresses is ok.
2229 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2231 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2232 struct ctdb_all_public_ips *ips = NULL;
2233 struct ctdb_uptime *uptime1 = NULL;
2234 struct ctdb_uptime *uptime2 = NULL;
2237 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2238 CTDB_CURRENT_NODE, &uptime1);
2240 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2241 talloc_free(mem_ctx);
2245 /* read the ip allocation from the local node */
2246 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2248 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2249 talloc_free(mem_ctx);
2253 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2254 CTDB_CURRENT_NODE, &uptime2);
2256 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2257 talloc_free(mem_ctx);
2261 /* skip the check if the startrecovery time has changed */
2262 if (timeval_compare(&uptime1->last_recovery_started,
2263 &uptime2->last_recovery_started) != 0) {
2264 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2265 talloc_free(mem_ctx);
2269 /* skip the check if the endrecovery time has changed */
2270 if (timeval_compare(&uptime1->last_recovery_finished,
2271 &uptime2->last_recovery_finished) != 0) {
2272 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2273 talloc_free(mem_ctx);
2277 /* skip the check if we have started but not finished recovery */
2278 if (timeval_compare(&uptime1->last_recovery_finished,
2279 &uptime1->last_recovery_started) != 1) {
2280 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
2281 talloc_free(mem_ctx);
2286 /* verify that we have the ip addresses we should have
2287 and we don't have ones we shouldn't have.
2288 if we find an inconsistency we set recmode to
2289 active on the local node and wait for the recmaster
2290 to do a full blown recovery
2291 */
2292 for (j=0; j<ips->num; j++) {
2293 if (ips->ips[j].pnn == pnn) {
2294 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2295 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2296 ctdb_addr_to_str(&ips->ips[j].addr)));
2297 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2299 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2301 talloc_free(mem_ctx);
2304 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2306 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2308 talloc_free(mem_ctx);
2313 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2314 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2315 ctdb_addr_to_str(&ips->ips[j].addr)));
2317 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2319 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2321 talloc_free(mem_ctx);
2324 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2326 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2328 talloc_free(mem_ctx);
2335 talloc_free(mem_ctx);
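/* callback for the async CTDB_CONTROL_GET_NODEMAP controls sent by
   get_remote_nodemaps() below: take ownership of the nodemap returned
   by the remote node and store it in the array slot for that pnn */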
2340 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2342 struct ctdb_node_map **remote_nodemaps = callback_data;
2344 if (node_pnn >= ctdb->num_nodes) {
2345 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2349 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
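/* ask every active node for its view of the nodemap; the replies are
   collected into the remote_nodemaps array by the callback above */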
2353 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2354 struct ctdb_node_map *nodemap,
2355 struct ctdb_node_map **remote_nodemaps)
2359 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2360 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2362 CONTROL_TIMEOUT(), false, tdb_null,
2363 async_getnodemap_callback,
2365 remote_nodemaps) != 0) {
2366 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2374 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
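/* state for a single reclock check: the child process doing the read,
   the pipe it reports back on, the timeout and fd events watching it,
   and the current status of the check */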
2375 struct ctdb_check_reclock_state {
2376 struct ctdb_context *ctdb;
2377 struct timeval start_time;
2378 int fd[2];
2379 pid_t child;
2380 struct timed_event *te;
2381 struct fd_event *fde;
2382 enum reclock_child_status status;
2385 /* when we free the reclock state we must kill any child process.
2386 */
2387 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2389 struct ctdb_context *ctdb = state->ctdb;
2391 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2393 if (state->fd[0] != -1) {
2394 close(state->fd[0]);
2397 if (state->fd[1] != -1) {
2398 close(state->fd[1]);
2401 kill(state->child, SIGKILL);
2405 /*
2406 called if our check_reclock child times out. this would happen if
2407 i/o to the reclock file blocks.
2408 */
2409 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2410 struct timeval t, void *private_data)
2412 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2413 struct ctdb_check_reclock_state);
2415 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timed out. CFS slow to grant locks?\n"));
2416 state->status = RECLOCK_TIMEOUT;
2419 /* this is called when the child process has completed checking the reclock
2420 file and has written data back to us through the pipe.
2421 */
2422 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2423 uint16_t flags, void *private_data)
2425 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2426 struct ctdb_check_reclock_state);
2430 /* we got a response from our child process so we can abort the
2431 check.
2432 */
2433 talloc_free(state->te);
2436 ret = read(state->fd[0], &c, 1);
2437 if (ret != 1 || c != RECLOCK_OK) {
2438 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2439 state->status = RECLOCK_FAILED;
2444 state->status = RECLOCK_OK;
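/* verify that the recovery lock file can still be read.  The read is
   done by a forked child so that a hung cluster filesystem cannot block
   the recovery daemon itself: the child reports the result back through
   a pipe, and a timed event marks the check as RECLOCK_TIMEOUT if no
   answer arrives within 15 seconds. */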
2448 static int check_recovery_lock(struct ctdb_context *ctdb)
2451 struct ctdb_check_reclock_state *state;
2452 pid_t parent = getpid();
2454 if (ctdb->recovery_lock_fd == -1) {
2455 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2459 state = talloc(ctdb, struct ctdb_check_reclock_state);
2460 CTDB_NO_MEMORY(ctdb, state);
2463 state->start_time = timeval_current();
2464 state->status = RECLOCK_CHECKING;
2468 ret = pipe(state->fd);
2471 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2475 state->child = fork();
2476 if (state->child == (pid_t)-1) {
2477 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2478 close(state->fd[0]);
2480 close(state->fd[1]);
2486 if (state->child == 0) {
2487 char cc = RECLOCK_OK;
2488 close(state->fd[0]);
2491 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2492 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2493 cc = RECLOCK_FAILED;
2496 write(state->fd[1], &cc, 1);
2497 /* make sure we die when our parent dies */
2498 while (kill(parent, 0) == 0 || errno != ESRCH) {
2500 write(state->fd[1], &cc, 1);
2504 close(state->fd[1]);
2507 talloc_set_destructor(state, check_reclock_destructor);
2509 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2510 ctdb_check_reclock_timeout, state);
2511 if (state->te == NULL) {
2512 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2517 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2518 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2519 reclock_child_handler,
2522 if (state->fde == NULL) {
2523 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
2528 while (state->status == RECLOCK_CHECKING) {
2529 event_loop_once(ctdb->ev);
2532 if (state->status == RECLOCK_FAILED) {
2533 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2534 close(ctdb->recovery_lock_fd);
2535 ctdb->recovery_lock_fd = -1;
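/* fetch the current reclock file setting from the main daemon and keep
   the recovery daemon's cached copy in sync: handle the reclock being
   disabled, being set for the first time, or being changed to a new
   path, closing any previously open file descriptor as needed */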
2544 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2546 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2547 const char *reclockfile;
2549 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2550 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2551 talloc_free(tmp_ctx);
2555 if (reclockfile == NULL) {
2556 if (ctdb->recovery_lock_file != NULL) {
2557 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2558 talloc_free(ctdb->recovery_lock_file);
2559 ctdb->recovery_lock_file = NULL;
2560 if (ctdb->recovery_lock_fd != -1) {
2561 close(ctdb->recovery_lock_fd);
2562 ctdb->recovery_lock_fd = -1;
2565 ctdb->tunable.verify_recovery_lock = 0;
2566 talloc_free(tmp_ctx);
2570 if (ctdb->recovery_lock_file == NULL) {
2571 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2572 if (ctdb->recovery_lock_fd != -1) {
2573 close(ctdb->recovery_lock_fd);
2574 ctdb->recovery_lock_fd = -1;
2576 talloc_free(tmp_ctx);
2581 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2582 talloc_free(tmp_ctx);
2586 talloc_free(ctdb->recovery_lock_file);
2587 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2588 ctdb->tunable.verify_recovery_lock = 0;
2589 if (ctdb->recovery_lock_fd != -1) {
2590 close(ctdb->recovery_lock_fd);
2591 ctdb->recovery_lock_fd = -1;
2594 talloc_free(tmp_ctx);
2598 /*
2599 the main monitoring loop
2600 */
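/* Once per recover_interval the loop below re-reads the tunables,
   reclock setting, nodemap, vnnmap and recmaster from the local daemon,
   bans nodes that keep causing recoveries, and forces an election or a
   recovery whenever the cluster state is found to be inconsistent.
   Most of the consistency checks are only performed on the node that
   currently holds the recmaster role. */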
2601 static void monitor_cluster(struct ctdb_context *ctdb)
2604 TALLOC_CTX *mem_ctx=NULL;
2605 struct ctdb_node_map *nodemap=NULL;
2606 struct ctdb_node_map *recmaster_nodemap=NULL;
2607 struct ctdb_node_map **remote_nodemaps=NULL;
2608 struct ctdb_vnn_map *vnnmap=NULL;
2609 struct ctdb_vnn_map *remote_vnnmap=NULL;
2610 int32_t debug_level;
2612 struct ctdb_recoverd *rec;
2614 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2616 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2617 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2621 rec->priority_time = timeval_current();
2623 /* register a message port for sending memory dumps */
2624 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2626 /* register a message port for recovery elections */
2627 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2629 /* when nodes are disabled/enabled */
2630 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2632 /* when we are asked to push out a flag change */
2633 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2635 /* register a message port for vacuum fetch */
2636 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2638 /* register a message port for reloadnodes */
2639 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2641 /* register a message port for performing a takeover run */
2642 ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2644 /* register a message port for disabling the ip check for a short while */
2645 ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
2649 talloc_free(mem_ctx);
2652 mem_ctx = talloc_new(ctdb);
2654 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2658 /* we only check for recovery once every second */
2659 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2661 /* verify that the main daemon is still running */
2662 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2663 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2667 /* ping the local daemon to tell it we are alive */
2668 ctdb_ctrl_recd_ping(ctdb);
2670 if (rec->election_timeout) {
2671 /* an election is in progress */
2675 /* read the debug level from the parent and update locally */
2676 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2678 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2681 LogLevel = debug_level;
2684 /* We must check if we need to ban a node here but we want to do this
2685 as early as possible so we don't wait until we have pulled the node
2686 map from the local node. that's why we use the hardcoded value 20
2687 */
2688 for (i=0; i<ctdb->num_nodes; i++) {
2689 struct ctdb_banning_state *ban_state;
2691 if (ctdb->nodes[i]->ban_state == NULL) {
2694 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
2695 if (ban_state->count < 20) {
2698 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2699 ctdb->nodes[i]->pnn, ban_state->count,
2700 ctdb->tunable.recovery_ban_period));
2701 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2702 ban_state->count = 0;
2705 /* get relevant tunables */
2706 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2708 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2712 /* get the current recovery lock file from the server */
2713 if (update_recovery_lock_file(ctdb) != 0) {
2714 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2718 /* Make sure that if recovery lock verification becomes disabled,
2719 we close the reclock file descriptor
2720 */
2721 if (ctdb->tunable.verify_recovery_lock == 0) {
2722 if (ctdb->recovery_lock_fd != -1) {
2723 close(ctdb->recovery_lock_fd);
2724 ctdb->recovery_lock_fd = -1;
2728 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2729 if (pnn == (uint32_t)-1) {
2730 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2734 /* get the vnnmap */
2735 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2737 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2742 /* get number of nodes */
2744 talloc_free(rec->nodemap);
2745 rec->nodemap = NULL;
2748 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2750 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2753 nodemap = rec->nodemap;
2755 /* check which node is the recovery master */
2756 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2758 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2762 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2763 if (rec->recmaster != pnn) {
2764 if (rec->ip_reallocate_ctx != NULL) {
2765 talloc_free(rec->ip_reallocate_ctx);
2766 rec->ip_reallocate_ctx = NULL;
2767 rec->reallocate_callers = NULL;
2770 /* if there are takeover runs requested, perform them and notify the waiters */
2771 if (rec->reallocate_callers) {
2772 process_ipreallocate_requests(ctdb, rec);
2775 if (rec->recmaster == (uint32_t)-1) {
2776 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2777 force_election(rec, pnn, nodemap);
2782 /* if the local daemon is STOPPED, we verify that the databases are
2783 also frozen and that the recmode is set to active
2784 */
2785 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2786 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2788 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2790 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2791 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2793 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2795 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2798 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2800 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2807 /* If the local node is stopped and we are the recmaster,
2808 yield that role
2809 */
2810 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2811 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2812 force_election(rec, pnn, nodemap);
2816 /* check that we (the recovery daemon) and the local ctdb daemon
2817 agree on whether we are banned or not
2818 */
2821 /* remember our own node flags */
2822 rec->node_flags = nodemap->nodes[pnn].flags;
2824 /* count how many active nodes there are */
2825 rec->num_active = 0;
2826 rec->num_connected = 0;
2827 for (i=0; i<nodemap->num; i++) {
2828 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2831 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2832 rec->num_connected++;
2837 /* verify that the recmaster node is still active */
2838 for (j=0; j<nodemap->num; j++) {
2839 if (nodemap->nodes[j].pnn==rec->recmaster) {
2844 if (j == nodemap->num) {
2845 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2846 force_election(rec, pnn, nodemap);
2850 /* if recovery master is disconnected we must elect a new recmaster */
2851 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2852 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2853 force_election(rec, pnn, nodemap);
2857 /* grab the nodemap from the recovery master to check if it is banned */
2858 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2859 mem_ctx, &recmaster_nodemap);
2861 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2862 nodemap->nodes[j].pnn));
2867 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2868 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2869 force_election(rec, pnn, nodemap);
2874 /* verify that we have all ip addresses we should have and we don't
2875 * have addresses we shouldn't have.
2876 */
2877 if (ctdb->do_checkpublicip) {
2878 if (rec->ip_check_disable_ctx == NULL) {
2879 if (verify_ip_allocation(ctdb, pnn) != 0) {
2880 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2887 /* if we are not the recmaster then we do not need to check
2888 if recovery is needed
2889 */
2890 if (pnn != rec->recmaster) {
2895 /* ensure our local copies of flags are right */
2896 ret = update_local_flags(rec, nodemap);
2897 if (ret == MONITOR_ELECTION_NEEDED) {
2898 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2899 force_election(rec, pnn, nodemap);
2902 if (ret != MONITOR_OK) {
2903 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2907 /* update the list of public ips that a node can handle for
2908 all connected nodes
2909 */
2910 if (ctdb->num_nodes != nodemap->num) {
2911 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2912 reload_nodes_file(ctdb);
2915 for (j=0; j<nodemap->num; j++) {
2916 /* release any existing data */
2917 if (ctdb->nodes[j]->public_ips) {
2918 talloc_free(ctdb->nodes[j]->public_ips);
2919 ctdb->nodes[j]->public_ips = NULL;
2922 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2926 /* grab a new shiny list of public ips from the node */
2927 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2928 ctdb->nodes[j]->pnn,
2930 &ctdb->nodes[j]->public_ips)) {
2931 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2932 ctdb->nodes[j]->pnn));
2938 /* verify that all active nodes agree that we are the recmaster */
2939 switch (verify_recmaster(rec, nodemap, pnn)) {
2940 case MONITOR_RECOVERY_NEEDED:
2941 /* can not happen */
2943 case MONITOR_ELECTION_NEEDED:
2944 force_election(rec, pnn, nodemap);
2948 case MONITOR_FAILED:
2953 if (rec->need_recovery) {
2954 /* a previous recovery didn't finish */
2955 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2959 /* verify that all active nodes are in normal mode
2960 and not in recovery mode
2961 */
2962 switch (verify_recmode(ctdb, nodemap)) {
2963 case MONITOR_RECOVERY_NEEDED:
2964 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2966 case MONITOR_FAILED:
2968 case MONITOR_ELECTION_NEEDED:
2969 /* can not happen */
2975 if (ctdb->tunable.verify_recovery_lock != 0) {
2976 /* we should have the reclock - check that it is not stale */
2977 ret = check_recovery_lock(ctdb);
2979 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
2980 ctdb_set_culprit(rec, ctdb->pnn);
2981 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2986 /* get the nodemap for all active remote nodes
2987 */
2988 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
2989 if (remote_nodemaps == NULL) {
2990 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2993 for(i=0; i<nodemap->num; i++) {
2994 remote_nodemaps[i] = NULL;
2996 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2997 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3001 /* verify that all other nodes have the same nodemap as we have
3002 */
3003 for (j=0; j<nodemap->num; j++) {
3004 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3008 if (remote_nodemaps[j] == NULL) {
3009 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3010 ctdb_set_culprit(rec, j);
3015 /* if the nodes disagree on how many nodes there are
3016 then this is a good reason to try recovery
3017 */
3018 if (remote_nodemaps[j]->num != nodemap->num) {
3019 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3020 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3021 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3022 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3026 /* if the nodes disagree on which nodes exist and are
3027 active, then that is also a good reason to do recovery
3028 */
3029 for (i=0;i<nodemap->num;i++) {
3030 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3031 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3032 nodemap->nodes[j].pnn, i,
3033 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3034 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3035 do_recovery(rec, mem_ctx, pnn, nodemap,
3041 /* verify the flags are consistent
3042 */
3043 for (i=0; i<nodemap->num; i++) {
3044 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3048 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3049 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3050 nodemap->nodes[j].pnn,
3051 nodemap->nodes[i].pnn,
3052 remote_nodemaps[j]->nodes[i].flags,
3053 nodemap->nodes[i].flags));
3055 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3056 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3057 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3058 do_recovery(rec, mem_ctx, pnn, nodemap,
3062 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3063 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3064 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3065 do_recovery(rec, mem_ctx, pnn, nodemap,
3074 /* there better be the same number of lmasters in the vnn map
3075 as there are active nodes or we will have to do a recovery
3076 */
3077 if (vnnmap->size != rec->num_active) {
3078 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3079 vnnmap->size, rec->num_active));
3080 ctdb_set_culprit(rec, ctdb->pnn);
3081 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3085 /* verify that all active nodes in the nodemap also exist in
3086 the vnnmap.
3087 */
3088 for (j=0; j<nodemap->num; j++) {
3089 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3092 if (nodemap->nodes[j].pnn == pnn) {
3096 for (i=0; i<vnnmap->size; i++) {
3097 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3101 if (i == vnnmap->size) {
3102 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3103 nodemap->nodes[j].pnn));
3104 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3105 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3111 /* verify that all other nodes have the same vnnmap
3112 and are from the same generation
3113 */
3114 for (j=0; j<nodemap->num; j++) {
3115 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3118 if (nodemap->nodes[j].pnn == pnn) {
3122 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3123 mem_ctx, &remote_vnnmap);
3125 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3126 nodemap->nodes[j].pnn));
3130 /* verify the vnnmap generation is the same */
3131 if (vnnmap->generation != remote_vnnmap->generation) {
3132 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3133 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3134 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3135 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3139 /* verify the vnnmap size is the same */
3140 if (vnnmap->size != remote_vnnmap->size) {
3141 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3142 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3143 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3144 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3148 /* verify the vnnmap is the same */
3149 for (i=0;i<vnnmap->size;i++) {
3150 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3151 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3152 nodemap->nodes[j].pnn));
3153 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3154 do_recovery(rec, mem_ctx, pnn, nodemap,
3161 /* we might need to change who has what IP assigned */
3162 if (rec->need_takeover_run) {
3163 rec->need_takeover_run = false;
3165 /* execute the "startrecovery" event script on all nodes */
3166 ret = run_startrecovery_eventscript(rec, nodemap);
3168 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3169 ctdb_set_culprit(rec, ctdb->pnn);
3170 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3173 ret = ctdb_takeover_run(ctdb, nodemap);
3175 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3176 ctdb_set_culprit(rec, ctdb->pnn);
3177 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3180 /* execute the "recovered" event script on all nodes */
3181 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3183 // we can't check whether the event completed successfully,
3184 // since this script WILL fail if the node is in recovery mode
3185 // and if that race happens, the code here would just cause a second
3186 // cascading recovery.
3188 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3189 ctdb_set_culprit(rec, ctdb->pnn);
3190 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3200 /*
3201 event handler for when the main ctdbd dies
3202 */
3203 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3204 uint16_t flags, void *private_data)
3206 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3210 /*
3211 called regularly to verify that the recovery daemon is still running
3212 */
3213 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3214 struct timeval yt, void *p)
3216 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3218 if (kill(ctdb->recoverd_pid, 0) != 0) {
3219 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
3221 ctdb_stop_recoverd(ctdb);
3222 ctdb_stop_keepalive(ctdb);
3223 ctdb_stop_monitoring(ctdb);
3224 ctdb_release_all_ips(ctdb);
3225 if (ctdb->methods != NULL) {
3226 ctdb->methods->shutdown(ctdb);
3228 ctdb_event_script(ctdb, "shutdown");
3233 event_add_timed(ctdb->ev, ctdb,
3234 timeval_current_ofs(30, 0),
3235 ctdb_check_recd, ctdb);
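/* SIGCHLD handler for the recovery daemon: reap any children that have
   exited (for example the reclock check child) so they do not linger as
   zombie processes */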
3238 static void recd_sig_child_handler(struct event_context *ev,
3239 struct signal_event *se, int signum, int count,
3243 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3248 pid = waitpid(-1, &status, WNOHANG);
3250 if (errno != ECHILD) {
3251 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3256 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3261 /*
3262 startup the recovery daemon as a child of the main ctdb daemon
3263 */
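/* The parent arms a periodic ctdb_check_recd timer and returns; the
   forked child becomes the recovery daemon: it switches itself from
   server to client mode, watches the pipe so it can exit when the main
   daemon goes away, installs the SIGCHLD handler and then runs
   monitor_cluster(), which is not expected to return. */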
3264 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3267 struct signal_event *se;
3269 if (pipe(fd) != 0) {
3273 ctdb->ctdbd_pid = getpid();
3275 ctdb->recoverd_pid = fork();
3276 if (ctdb->recoverd_pid == -1) {
3280 if (ctdb->recoverd_pid != 0) {
3282 event_add_timed(ctdb->ev, ctdb,
3283 timeval_current_ofs(30, 0),
3284 ctdb_check_recd, ctdb);
3290 srandom(getpid() ^ time(NULL));
3292 if (switch_from_server_to_client(ctdb) != 0) {
3293 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3297 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3298 ctdb_recoverd_parent, &fd[0]);
3300 /* set up a handler to pick up sigchld */
3301 se = event_add_signal(ctdb->ev, ctdb,
3303 recd_sig_child_handler,
3306 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3310 monitor_cluster(ctdb);
3312 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3316 /*
3317 shutdown the recovery daemon
3318 */
3319 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3321 if (ctdb->recoverd_pid == 0) {
3325 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3326 kill(ctdb->recoverd_pid, SIGTERM);