4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
39 struct rd_memdump_reply *rd;
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
69 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
70 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
74 ban a node for a period of time
76 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
79 struct ctdb_context *ctdb = rec->ctdb;
80 struct ctdb_ban_time bantime;
82 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
90 bantime.time = ban_time;
92 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
94 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* result of a cluster monitoring pass; MONITOR_FAILED is also what
 * update_local_flags() returns on error */
100 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
104 run the "recovered" eventscript on all nodes
106 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
111 tmp_ctx = talloc_new(ctdb);
112 CTDB_NO_MEMORY(ctdb, tmp_ctx);
114 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
115 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
117 CONTROL_TIMEOUT(), false, tdb_null,
120 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
122 talloc_free(tmp_ctx);
126 talloc_free(tmp_ctx);
131 remember the trouble maker
133 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
135 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
136 struct ctdb_banning_state *ban_state;
138 if (culprit > ctdb->num_nodes) {
139 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
143 if (ctdb->nodes[culprit]->ban_state == NULL) {
144 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
145 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
149 ban_state = ctdb->nodes[culprit]->ban_state;
150 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
151 /* this was the first time in a long while this node
152 misbehaved so we will forgive any old transgressions.
154 ban_state->count = 0;
157 ban_state->count += count;
158 ban_state->last_reported_time = timeval_current();
159 rec->last_culprit_node = culprit;
163 remember the trouble maker
165 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
167 ctdb_set_culprit_count(rec, culprit, 1);
171 /* this callback is called for every node that failed to execute the
174 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
176 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
178 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
180 ctdb_set_culprit(rec, node_pnn);
184 run the "startrecovery" eventscript on all nodes
186 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
190 struct ctdb_context *ctdb = rec->ctdb;
192 tmp_ctx = talloc_new(ctdb);
193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
195 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
198 CONTROL_TIMEOUT(), false, tdb_null,
200 startrecovery_fail_callback,
202 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
203 talloc_free(tmp_ctx);
207 talloc_free(tmp_ctx);
211 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
213 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
214 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
217 if (node_pnn < ctdb->num_nodes) {
218 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
223 update the node capabilities for all connected nodes
225 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
230 tmp_ctx = talloc_new(ctdb);
231 CTDB_NO_MEMORY(ctdb, tmp_ctx);
233 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
234 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
238 async_getcap_callback, NULL,
240 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
241 talloc_free(tmp_ctx);
245 talloc_free(tmp_ctx);
249 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
251 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
253 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
254 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
257 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
259 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
261 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
262 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* NOTE(review): garbled extraction — line numbers fused in, several statements
 * (loop braces, the per-priority freeze data, returns) are missing.  Kept
 * byte-identical; restore from upstream ctdb before compiling.
 *
 * Sets the recovery mode on all active nodes.  When entering
 * CTDB_RECOVERY_ACTIVE it first freezes all nodes, one database priority
 * level at a time (1..NUM_DB_PRIORITIES); freeze failures are charged via
 * set_recmode_fail_callback. */
266 change recovery mode on all nodes
268 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
274 tmp_ctx = talloc_new(ctdb);
275 CTDB_NO_MEMORY(ctdb, tmp_ctx);
277 /* freeze all nodes */
278 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
279 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
282 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
283 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
288 set_recmode_fail_callback,
290 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
291 talloc_free(tmp_ctx);
/* broadcast the new recovery mode to every active node */
298 data.dsize = sizeof(uint32_t);
299 data.dptr = (unsigned char *)&rec_mode;
301 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
308 talloc_free(tmp_ctx);
312 talloc_free(tmp_ctx);
317 change recovery master on all node
319 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
325 tmp_ctx = talloc_new(ctdb);
326 CTDB_NO_MEMORY(ctdb, tmp_ctx);
328 data.dsize = sizeof(uint32_t);
329 data.dptr = (unsigned char *)&pnn;
331 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
334 CONTROL_TIMEOUT(), false, data,
337 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
338 talloc_free(tmp_ctx);
342 talloc_free(tmp_ctx);
/* NOTE(review): garbled extraction — line numbers fused in, loop braces and
 * returns missing.  Kept byte-identical; restore from upstream before use. */
346 /* update all remote nodes to use the same db priority that we have
347 this can fail if the remote node has not yet been upgraded to
348 support this function, so we always return success and never fail
349 a recovery if this call fails.
351 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
352 struct ctdb_node_map *nodemap,
353 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
358 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
360 /* step through all local databases */
361 for (db=0; db<dbmap->num;db++) {
363 struct ctdb_db_priority db_prio;
/* read the priority from the local node, then push it to all others */
366 db_prio.db_id = dbmap->dbs[db].dbid;
367 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
369 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
373 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
375 data.dptr = (uint8_t *)&db_prio;
376 data.dsize = sizeof(db_prio);
378 if (ctdb_client_async_control(ctdb,
379 CTDB_CONTROL_SET_DB_PRIORITY,
381 CONTROL_TIMEOUT(), false, data,
/* failure here is deliberately non-fatal (see header comment) */
384 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
/* NOTE(review): garbled extraction — line numbers fused in, braces/continues/
 * returns missing.  Kept byte-identical; restore from upstream before use. */
392 ensure all other nodes have attached to any databases that we have
394 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
395 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
398 struct ctdb_dbid_map *remote_dbmap;
400 /* verify that all other nodes have all our databases */
401 for (j=0; j<nodemap->num; j++) {
402 /* we don't need to check ourselves */
403 if (nodemap->nodes[j].pnn == pnn) {
406 /* don't check nodes that are unavailable */
407 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
411 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
412 mem_ctx, &remote_dbmap);
414 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
418 /* step through all local databases */
419 for (db=0; db<dbmap->num;db++) {
/* look for this db on the remote node */
423 for (i=0;i<remote_dbmap->num;i++) {
424 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
428 /* the remote node already has this database */
429 if (i!=remote_dbmap->num) {
432 /* ok so we need to create this database */
433 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
436 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
439 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
440 mem_ctx, name, dbmap->dbs[db].persistent);
442 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
/* NOTE(review): garbled extraction — line numbers fused in, braces/continues/
 * returns missing.  Kept byte-identical; restore from upstream before use. */
453 ensure we are attached to any databases that anyone else is attached to
455 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
456 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
459 struct ctdb_dbid_map *remote_dbmap;
461 /* verify that we have all database any other node has */
462 for (j=0; j<nodemap->num; j++) {
463 /* we don't need to check ourselves */
464 if (nodemap->nodes[j].pnn == pnn) {
467 /* don't check nodes that are unavailable */
468 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
472 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
473 mem_ctx, &remote_dbmap);
475 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
479 /* step through all databases on the remote node */
480 for (db=0; db<remote_dbmap->num;db++) {
/* look for this db in our local dbmap */
483 for (i=0;i<(*dbmap)->num;i++) {
484 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
488 /* we already have this db locally */
489 if (i!=(*dbmap)->num) {
492 /* ok so we need to create this database and
495 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
496 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
498 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
499 nodemap->nodes[j].pnn));
502 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
503 remote_dbmap->dbs[db].persistent);
505 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* `dbmap` is an out-parameter: re-read it after attaching a new db */
508 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
/* NOTE(review): garbled extraction — line numbers fused in, the record-loop
 * header and several returns/frees are missing.  Kept byte-identical;
 * restore from upstream before use.  In particular, a free of
 * `existing.dptr` (returned malloc'd by tdb_fetch) is not visible here —
 * presumably present in the dropped lines; verify against upstream. */
521 pull the remote database contents from one node into the recdb
523 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
524 struct tdb_wrap *recdb, uint32_t dbid,
529 struct ctdb_marshall_buffer *reply;
530 struct ctdb_rec_data *rec;
532 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
534 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
535 CONTROL_TIMEOUT(), &outdata);
537 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
538 talloc_free(tmp_ctx);
542 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the reply before touching reply->data */
544 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
545 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
546 talloc_free(tmp_ctx);
/* walk the marshalled records; each record is key bytes followed by
 * an ltdb header + data */
550 rec = (struct ctdb_rec_data *)&reply->data[0];
554 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
556 struct ctdb_ltdb_header *hdr;
559 key.dptr = &rec->data[0];
560 key.dsize = rec->keylen;
561 data.dptr = &rec->data[key.dsize];
562 data.dsize = rec->datalen;
564 hdr = (struct ctdb_ltdb_header *)data.dptr;
566 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
567 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
568 talloc_free(tmp_ctx);
572 /* fetch the existing record, if any */
573 existing = tdb_fetch(recdb->tdb, key);
575 if (existing.dptr != NULL) {
576 struct ctdb_ltdb_header header;
577 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
578 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
579 (unsigned)existing.dsize, srcnode));
581 talloc_free(tmp_ctx);
584 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing copy unless the incoming record has a newer rsn,
 * or an equal rsn from a node other than the recovery master */
586 if (!(header.rsn < hdr->rsn ||
587 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
592 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
593 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
594 talloc_free(tmp_ctx);
599 talloc_free(tmp_ctx);
605 pull all the remote database contents into the recdb
607 static int pull_remote_database(struct ctdb_context *ctdb,
608 struct ctdb_recoverd *rec,
609 struct ctdb_node_map *nodemap,
610 struct tdb_wrap *recdb, uint32_t dbid,
615 /* pull all records from all other nodes across onto this node
616 (this merges based on rsn)
618 for (j=0; j<nodemap->num; j++) {
619 /* dont merge from nodes that are unavailable */
620 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
623 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
624 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
625 nodemap->nodes[j].pnn));
626 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
636 update flags on all active nodes
638 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
642 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
644 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
652 ensure all nodes have the same vnnmap we do
654 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
655 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
659 /* push the new vnn map out to all the nodes */
660 for (j=0; j<nodemap->num; j++) {
661 /* dont push to nodes that are unavailable */
662 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
666 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
668 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* NOTE(review): the `struct vacuum_info {` opening line and some members were
 * dropped by the extraction.  Tracks one in-flight per-node vacuum-fetch run;
 * instances are linked into rec->vacuum_info (see DLIST_ADD/DLIST_REMOVE
 * below).  Kept byte-identical. */
678 struct vacuum_info *next, *prev;
679 struct ctdb_recoverd *rec;
681 struct ctdb_db_context *ctdb_db;
682 struct ctdb_marshall_buffer *recs;
683 struct ctdb_rec_data *r;
/* forward declaration: callback and driver are mutually recursive */
686 static void vacuum_fetch_next(struct vacuum_info *v);
689 called when a vacuum fetch has completed - just free it and do the next one
691 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
693 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
695 vacuum_fetch_next(v);
/* NOTE(review): garbled extraction — line numbers fused in; the record
 * advance/skip bookkeeping, free(data.dptr) calls and loop braces are
 * missing.  Kept byte-identical; restore from upstream before use. */
700 process the next element from the vacuum list
702 static void vacuum_fetch_next(struct vacuum_info *v)
704 struct ctdb_call call;
705 struct ctdb_rec_data *r;
707 while (v->recs->count) {
708 struct ctdb_client_call_state *state;
710 struct ctdb_ltdb_header *hdr;
/* a CTDB_NULL_FUNC call with IMMEDIATE_MIGRATION pulls the record here */
713 call.call_id = CTDB_NULL_FUNC;
714 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance v->r to the next marshalled record */
717 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
720 call.key.dptr = &r->data[0];
721 call.key.dsize = r->keylen;
723 /* ensure we don't block this daemon - just skip a record if we can't get
725 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
729 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
730 if (data.dptr == NULL) {
731 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
735 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
737 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
741 hdr = (struct ctdb_ltdb_header *)data.dptr;
742 if (hdr->dmaster == v->rec->ctdb->pnn) {
743 /* its already local */
745 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* chainlock is released before the async call completes; the callback
 * resumes the loop via vacuum_fetch_next */
751 state = ctdb_call_send(v->ctdb_db, &call);
752 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
754 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
758 state->async.fn = vacuum_fetch_callback;
759 state->async.private_data = v;
768 destroy a vacuum info structure
770 static int vacuum_info_destructor(struct vacuum_info *v)
772 DLIST_REMOVE(v->rec->vacuum_info, v);
/* NOTE(review): garbled extraction — line numbers fused in; several
 * declarations, returns and braces are missing.  In particular `srcnode`
 * is used below but the line deriving it (presumably from the message or
 * record header) was dropped — verify against upstream.  Kept
 * byte-identical. */
778 handler for vacuum fetch
780 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
781 TDB_DATA data, void *private_data)
783 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
784 struct ctdb_marshall_buffer *recs;
786 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
788 struct ctdb_dbid_map *dbmap=NULL;
789 bool persistent = false;
790 struct ctdb_db_context *ctdb_db;
791 struct ctdb_rec_data *r;
793 struct vacuum_info *v;
795 recs = (struct ctdb_marshall_buffer *)data.dptr;
796 r = (struct ctdb_rec_data *)&recs->data[0];
798 if (recs->count == 0) {
799 talloc_free(tmp_ctx);
/* only one fetch run per (srcnode, db) pair at a time */
805 for (v=rec->vacuum_info;v;v=v->next) {
806 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
807 /* we're already working on records from this node */
808 talloc_free(tmp_ctx);
813 /* work out if the database is persistent */
814 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
816 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
817 talloc_free(tmp_ctx);
821 for (i=0;i<dbmap->num;i++) {
822 if (dbmap->dbs[i].dbid == recs->db_id) {
823 persistent = dbmap->dbs[i].persistent;
827 if (i == dbmap->num) {
828 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
829 talloc_free(tmp_ctx);
833 /* find the name of this database */
834 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
835 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
836 talloc_free(tmp_ctx);
/* attach so we can migrate the records into the local copy */
841 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
842 if (ctdb_db == NULL) {
843 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
844 talloc_free(tmp_ctx);
848 v = talloc_zero(rec, struct vacuum_info);
850 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
851 talloc_free(tmp_ctx);
856 v->srcnode = srcnode;
857 v->ctdb_db = ctdb_db;
/* keep a private copy of the record blob; the incoming data is not ours */
858 v->recs = talloc_memdup(v, recs, data.dsize);
859 if (v->recs == NULL) {
860 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
862 talloc_free(tmp_ctx);
865 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
867 DLIST_ADD(rec->vacuum_info, v);
869 talloc_set_destructor(v, vacuum_info_destructor);
871 vacuum_fetch_next(v);
872 talloc_free(tmp_ctx);
877 called when ctdb_wait_timeout should finish
879 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
880 struct timeval yt, void *p)
882 uint32_t *timed_out = (uint32_t *)p;
887 wait for a given number of seconds
889 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
891 uint32_t timed_out = 0;
892 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
894 event_loop_once(ctdb->ev);
899 called when an election times out (ends)
901 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
902 struct timeval t, void *p)
904 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
905 rec->election_timeout = NULL;
907 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
912 wait for an election to finish. It finished election_timeout seconds after
913 the last election packet is received
915 static void ctdb_wait_election(struct ctdb_recoverd *rec)
917 struct ctdb_context *ctdb = rec->ctdb;
918 while (rec->election_timeout) {
919 event_loop_once(ctdb->ev);
/* NOTE(review): garbled extraction — line numbers fused in; continues,
 * braces and the final return are missing.  Note the function is declared
 * `int` but returns MONITOR_FAILED on error — presumably MONITOR_OK on the
 * (dropped) success path; verify against upstream.  Kept byte-identical. */
924 Update our local flags from all remote connected nodes.
925 This is only run when we are or we believe we are the recovery master
927 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
930 struct ctdb_context *ctdb = rec->ctdb;
931 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
933 /* get the nodemap for all active remote nodes and verify
934 they are the same as for this node
936 for (j=0; j<nodemap->num; j++) {
937 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
940 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
943 if (nodemap->nodes[j].pnn == ctdb->pnn) {
947 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
948 mem_ctx, &remote_nodemap);
950 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
951 nodemap->nodes[j].pnn));
952 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
953 talloc_free(mem_ctx);
954 return MONITOR_FAILED;
956 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
957 /* We should tell our daemon about this so it
958 updates its flags or else we will log the same
959 message again in the next iteration of recovery.
960 Since we are the recovery master we can just as
961 well update the flags on all nodes.
963 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
965 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
969 /* Update our local copy of the flags in the recovery
972 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
973 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
974 nodemap->nodes[j].flags));
975 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
977 talloc_free(remote_nodemap);
979 talloc_free(mem_ctx);
984 /* Create a new random generation ip.
985 The generation id can not be the INVALID_GENERATION id
987 static uint32_t new_generation(void)
992 generation = random();
994 if (generation != INVALID_GENERATION) {
/* NOTE(review): garbled extraction — line numbers fused in; declarations,
 * an unlink of any stale recdb file, and returns are missing.  Kept
 * byte-identical; restore from upstream before use. */
1004 create a temporary working database
1006 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1009 struct tdb_wrap *recdb;
1012 /* open up the temporary recovery database */
1013 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
/* single-process scratch db: no locking, and no mmap unless we run
 * with realtime scheduling enabled */
1019 tdb_flags = TDB_NOLOCK;
1020 if (!ctdb->do_setsched) {
1021 tdb_flags |= TDB_NOMMAP;
1023 tdb_flags |= TDB_DISALLOW_NESTING;
1025 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1026 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1027 if (recdb == NULL) {
1028 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
/* NOTE(review): the `struct recdb_data {` opening line and remaining
 * members (len, failed, persistent — all used by traverse_recdb below)
 * were dropped by the extraction.  Kept byte-identical. */
1038 a traverse function for pulling all relevant records from recdb
1041 struct ctdb_context *ctdb;
1042 struct ctdb_marshall_buffer *recdata;
/* NOTE(review): garbled extraction — line numbers fused in, returns and
 * braces missing.  Also note a visible defect: when talloc_realloc_size
 * fails, the DEBUG message dereferences params->recdata->count after
 * params->recdata has been set to NULL — a NULL dereference on the OOM
 * path.  Kept byte-identical; fix when restoring from upstream. */
1048 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1050 struct recdb_data *params = (struct recdb_data *)p;
1051 struct ctdb_rec_data *rec;
1052 struct ctdb_ltdb_header *hdr;
1054 /* skip empty records */
1055 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1059 /* update the dmaster field to point to us */
1060 hdr = (struct ctdb_ltdb_header *)data.dptr;
1061 if (!params->persistent) {
1062 hdr->dmaster = params->ctdb->pnn;
1065 /* add the record to the blob ready to send to the nodes */
1066 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1068 params->failed = true;
/* grow the marshall buffer and append the record bytes at the tail */
1071 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1072 if (params->recdata == NULL) {
1073 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1074 rec->length + params->len, params->recdata->count));
1075 params->failed = true;
1078 params->recdata->count++;
1079 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1080 params->len += rec->length;
/* NOTE(review): garbled extraction — line numbers fused in, declarations
 * and returns missing.  Also note mojibake below: `¶ms` is an HTML
 * entity corruption of `&params`.  Kept byte-identical; fix when
 * restoring from upstream. */
1087 push the recdb database out to all nodes
1089 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1091 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1093 struct recdb_data params;
1094 struct ctdb_marshall_buffer *recdata;
1096 TALLOC_CTX *tmp_ctx;
1099 tmp_ctx = talloc_new(ctdb);
1100 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1102 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1103 CTDB_NO_MEMORY(ctdb, recdata);
1105 recdata->db_id = dbid;
/* marshal every record in the recdb into one PUSH_DB blob */
1108 params.recdata = recdata;
1109 params.len = offsetof(struct ctdb_marshall_buffer, data);
1110 params.failed = false;
1111 params.persistent = persistent;
1113 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1114 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1115 talloc_free(params.recdata);
1116 talloc_free(tmp_ctx);
1120 if (params.failed) {
1121 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1122 talloc_free(params.recdata);
1123 talloc_free(tmp_ctx);
/* traverse_recdb may have realloc'd the buffer; pick up the new pointer */
1127 recdata = params.recdata;
1129 outdata.dptr = (void *)recdata;
1130 outdata.dsize = params.len;
1132 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1133 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1135 CONTROL_TIMEOUT(), false, outdata,
1138 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1139 talloc_free(recdata);
1140 talloc_free(tmp_ctx);
1144 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1145 dbid, recdata->count));
1147 talloc_free(recdata);
1148 talloc_free(tmp_ctx);
/* NOTE(review): garbled extraction — line numbers fused in; parameters
 * (dbid, persistent, pnn), declarations and returns are missing.  Kept
 * byte-identical; restore from upstream before use.
 *
 * Full recovery of one database: pull every node's copy into a local
 * scratch tdb, wipe the db cluster-wide inside the recovery transaction,
 * then push the merged contents back out. */
1155 go through a full recovery on one database
1157 static int recover_database(struct ctdb_recoverd *rec,
1158 TALLOC_CTX *mem_ctx,
1162 struct ctdb_node_map *nodemap,
1163 uint32_t transaction_id)
1165 struct tdb_wrap *recdb;
1167 struct ctdb_context *ctdb = rec->ctdb;
1169 struct ctdb_control_wipe_database w;
1172 recdb = create_recdb(ctdb, mem_ctx);
1173 if (recdb == NULL) {
1177 /* pull all remote databases onto the recdb */
1178 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1180 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1184 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1186 /* wipe all the remote databases. This is safe as we are in a transaction */
1188 w.transaction_id = transaction_id;
1190 data.dptr = (void *)&w;
1191 data.dsize = sizeof(w);
1193 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1194 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1196 CONTROL_TIMEOUT(), false, data,
1199 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1204 /* push out the correct database. This sets the dmaster and skips
1205 the empty records */
1206 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1212 /* all done with this database */
1219 reload the nodes file
1221 static void reload_nodes_file(struct ctdb_context *ctdb)
1224 ctdb_load_nodes_file(ctdb);
1229 we are the recmaster, and recovery is needed - start a recovery run
1231 static int do_recovery(struct ctdb_recoverd *rec,
1232 TALLOC_CTX *mem_ctx, uint32_t pnn,
1233 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1235 struct ctdb_context *ctdb = rec->ctdb;
1237 uint32_t generation;
1238 struct ctdb_dbid_map *dbmap;
1241 struct timeval start_time;
1243 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1245 /* if recovery fails, force it again */
1246 rec->need_recovery = true;
1248 for (i=0; i<ctdb->num_nodes; i++) {
1249 struct ctdb_banning_state *ban_state;
1251 if (ctdb->nodes[i]->ban_state == NULL) {
1254 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1255 if (ban_state->count < 2*ctdb->num_nodes) {
1258 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1259 ctdb->nodes[i]->pnn, ban_state->count,
1260 ctdb->tunable.recovery_ban_period));
1261 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1262 ban_state->count = 0;
1266 if (ctdb->tunable.verify_recovery_lock != 0) {
1267 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1268 start_time = timeval_current();
1269 if (!ctdb_recovery_lock(ctdb, true)) {
1270 ctdb_set_culprit(rec, pnn);
1271 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1274 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1275 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1278 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1280 /* get a list of all databases */
1281 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1283 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1287 /* we do the db creation before we set the recovery mode, so the freeze happens
1288 on all databases we will be dealing with. */
1290 /* verify that we have all the databases any other node has */
1291 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1293 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1297 /* verify that all other nodes have all our databases */
1298 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1300 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1303 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1305 /* update the database priority for all remote databases */
1306 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1308 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1310 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1313 /* set recovery mode to active on all nodes */
1314 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1316 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1320 /* execute the "startrecovery" event script on all nodes */
1321 ret = run_startrecovery_eventscript(rec, nodemap);
1323 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1327 /* pick a new generation number */
1328 generation = new_generation();
1330 /* change the vnnmap on this node to use the new generation
1331 number but not on any other nodes.
1332 this guarantees that if we abort the recovery prematurely
1333 for some reason (a node stops responding?)
1334 that we can just return immediately and we will reenter
1335 recovery shortly again.
1336 I.e. we deliberately leave the cluster with an inconsistent
1337 generation id to allow us to abort recovery at any stage and
1338 just restart it from scratch.
1340 vnnmap->generation = generation;
1341 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1343 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1347 data.dptr = (void *)&generation;
1348 data.dsize = sizeof(uint32_t);
1350 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1351 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1353 CONTROL_TIMEOUT(), false, data,
1355 transaction_start_fail_callback,
1357 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1358 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1360 CONTROL_TIMEOUT(), false, tdb_null,
1364 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1369 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1371 for (i=0;i<dbmap->num;i++) {
1372 ret = recover_database(rec, mem_ctx,
1374 dbmap->dbs[i].persistent,
1375 pnn, nodemap, generation);
1377 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1382 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1384 /* commit all the changes */
1385 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1387 CONTROL_TIMEOUT(), false, data,
1390 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1394 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1397 /* update the capabilities for all nodes */
1398 ret = update_capabilities(ctdb, nodemap);
1400 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1404 /* build a new vnn map with all the currently active and
1406 generation = new_generation();
1407 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1408 CTDB_NO_MEMORY(ctdb, vnnmap);
1409 vnnmap->generation = generation;
1411 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1412 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1413 for (i=j=0;i<nodemap->num;i++) {
1414 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1417 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1418 /* this node can not be an lmaster */
1419 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1424 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1425 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1426 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1429 if (vnnmap->size == 0) {
1430 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1432 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1433 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1434 vnnmap->map[0] = pnn;
1437 /* update to the new vnnmap on all nodes */
1438 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1440 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1444 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1446 /* update recmaster to point to us for all nodes */
1447 ret = set_recovery_master(ctdb, nodemap, pnn);
1449 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1453 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1456 update all nodes to have the same flags that we have
1458 for (i=0;i<nodemap->num;i++) {
1459 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1463 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1465 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1470 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1472 /* disable recovery mode */
1473 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1475 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1479 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1482 tell nodes to takeover their public IPs
1484 rec->need_takeover_run = false;
1485 ret = ctdb_takeover_run(ctdb, nodemap);
1487 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1490 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1492 /* execute the "recovered" event script on all nodes */
1493 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1495 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1499 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1501 /* send a message to all clients telling them that the cluster
1502 has been reconfigured */
1503 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1505 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1507 rec->need_recovery = false;
1509 /* we managed to complete a full recovery, make sure to forgive
1510 any past sins by the nodes that could now participate in the
1513 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1514 for (i=0;i<nodemap->num;i++) {
1515 struct ctdb_banning_state *ban_state;
1517 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1521 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1522 if (ban_state == NULL) {
1526 ban_state->count = 0;
1530 /* We just finished a recovery successfully.
1531 We now wait for rerecovery_timeout before we allow
1532 another recovery to take place.
1534 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1535 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1536 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;		/* how many nodes this candidate can see */
	struct timeval priority_time;	/* when the candidate daemon started; older wins */
	uint32_t pnn;			/* candidate's physical node number (final tie-break) */
	uint32_t node_flags;		/* candidate's own node flags (BANNED/STOPPED disqualify) */
};
1554 form this nodes election data
1556 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1559 struct ctdb_node_map *nodemap;
1560 struct ctdb_context *ctdb = rec->ctdb;
1564 em->pnn = rec->ctdb->pnn;
1565 em->priority_time = rec->priority_time;
1567 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1569 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1573 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1574 em->node_flags = rec->node_flags;
1576 for (i=0;i<nodemap->num;i++) {
1577 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1578 em->num_connected++;
1582 /* we shouldnt try to win this election if we cant be a recmaster */
1583 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1584 em->num_connected = 0;
1585 em->priority_time = timeval_current();
1588 talloc_free(nodemap);
1592 see if the given election data wins
1594 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1596 struct election_message myem;
1599 ctdb_election_data(rec, &myem);
1601 /* we cant win if we dont have the recmaster capability */
1602 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1606 /* we cant win if we are banned */
1607 if (rec->node_flags & NODE_FLAGS_BANNED) {
1611 /* we cant win if we are stopped */
1612 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1616 /* we will automatically win if the other node is banned */
1617 if (em->node_flags & NODE_FLAGS_BANNED) {
1621 /* we will automatically win if the other node is banned */
1622 if (em->node_flags & NODE_FLAGS_STOPPED) {
1626 /* try to use the most connected node */
1628 cmp = (int)myem.num_connected - (int)em->num_connected;
1631 /* then the longest running node */
1633 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1637 cmp = (int)myem.pnn - (int)em->pnn;
1644 send out an election request
1646 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1649 TDB_DATA election_data;
1650 struct election_message emsg;
1652 struct ctdb_context *ctdb = rec->ctdb;
1654 srvid = CTDB_SRVID_RECOVERY;
1656 ctdb_election_data(rec, &emsg);
1658 election_data.dsize = sizeof(struct election_message);
1659 election_data.dptr = (unsigned char *)&emsg;
1662 /* send an election message to all active nodes */
1663 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1664 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1667 /* A new node that is already frozen has entered the cluster.
1668 The existing nodes are not frozen and dont need to be frozen
1669 until the election has ended and we start the actual recovery
1671 if (update_recmaster == true) {
1672 /* first we assume we will win the election and set
1673 recoverymaster to be ourself on the current node
1675 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1677 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1687 this function will unban all nodes in the cluster
1689 static void unban_all_nodes(struct ctdb_context *ctdb)
1692 struct ctdb_node_map *nodemap;
1693 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1695 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1697 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1701 for (i=0;i<nodemap->num;i++) {
1702 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1703 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1704 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1708 talloc_free(tmp_ctx);
1713 we think we are winning the election - send a broadcast election request
1715 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1717 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1720 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1722 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1725 talloc_free(rec->send_election_te);
1726 rec->send_election_te = NULL;
1730 handler for memory dumps
1732 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1733 TDB_DATA data, void *private_data)
1735 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1738 struct rd_memdump_reply *rd;
1740 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1741 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1742 talloc_free(tmp_ctx);
1745 rd = (struct rd_memdump_reply *)data.dptr;
1747 dump = talloc_zero(tmp_ctx, TDB_DATA);
1749 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1750 talloc_free(tmp_ctx);
1753 ret = ctdb_dump_memory(ctdb, dump);
1755 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1756 talloc_free(tmp_ctx);
1760 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1762 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1764 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1765 talloc_free(tmp_ctx);
1769 talloc_free(tmp_ctx);
1773 handler for reload_nodes
1775 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1776 TDB_DATA data, void *private_data)
1778 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1780 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1782 reload_nodes_file(rec->ctdb);
1786 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1787 struct timeval yt, void *p)
1789 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1791 talloc_free(rec->ip_check_disable_ctx);
1792 rec->ip_check_disable_ctx = NULL;
1795 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1796 TDB_DATA data, void *private_data)
1798 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1801 if (rec->ip_check_disable_ctx != NULL) {
1802 talloc_free(rec->ip_check_disable_ctx);
1803 rec->ip_check_disable_ctx = NULL;
1806 if (data.dsize != sizeof(uint32_t)) {
1807 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1808 "expexting %lu\n", (long unsigned)data.dsize,
1809 (long unsigned)sizeof(uint32_t)));
1812 if (data.dptr == NULL) {
1813 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1817 timeout = *((uint32_t *)data.dptr);
1818 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1820 rec->ip_check_disable_ctx = talloc_new(rec);
1821 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
1823 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1828 handler for ip reallocate, just add it to the list of callers and
1829 handle this later in the monitor_cluster loop so we do not recurse
1830 with other callers to takeover_run()
1832 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1833 TDB_DATA data, void *private_data)
1835 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1836 struct ip_reallocate_list *caller;
1838 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1839 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1843 if (rec->ip_reallocate_ctx == NULL) {
1844 rec->ip_reallocate_ctx = talloc_new(rec);
1845 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
1848 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1849 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1851 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1852 caller->next = rec->reallocate_callers;
1853 rec->reallocate_callers = caller;
1858 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1860 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1863 struct ip_reallocate_list *callers;
1865 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1866 ret = ctdb_takeover_run(ctdb, rec->nodemap);
1867 result.dsize = sizeof(int32_t);
1868 result.dptr = (uint8_t *)&ret;
1870 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1872 /* Someone that sent srvid==0 does not want a reply */
1873 if (callers->rd->srvid == 0) {
1876 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
1877 "%u:%llu\n", (unsigned)callers->rd->pnn,
1878 (unsigned long long)callers->rd->srvid));
1879 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1881 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
1882 "message to %u:%llu\n",
1883 (unsigned)callers->rd->pnn,
1884 (unsigned long long)callers->rd->srvid));
1888 talloc_free(tmp_ctx);
1889 talloc_free(rec->ip_reallocate_ctx);
1890 rec->ip_reallocate_ctx = NULL;
1891 rec->reallocate_callers = NULL;
1897 handler for recovery master elections
1899 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1900 TDB_DATA data, void *private_data)
1902 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1904 struct election_message *em = (struct election_message *)data.dptr;
1905 TALLOC_CTX *mem_ctx;
1907 /* we got an election packet - update the timeout for the election */
1908 talloc_free(rec->election_timeout);
1909 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1910 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1911 ctdb_election_timeout, rec);
1913 mem_ctx = talloc_new(ctdb);
1915 /* someone called an election. check their election data
1916 and if we disagree and we would rather be the elected node,
1917 send a new election message to all other nodes
1919 if (ctdb_election_win(rec, em)) {
1920 if (!rec->send_election_te) {
1921 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1922 timeval_current_ofs(0, 500000),
1923 election_send_request, rec);
1925 talloc_free(mem_ctx);
1926 /*unban_all_nodes(ctdb);*/
1931 talloc_free(rec->send_election_te);
1932 rec->send_election_te = NULL;
1934 if (ctdb->tunable.verify_recovery_lock != 0) {
1935 /* release the recmaster lock */
1936 if (em->pnn != ctdb->pnn &&
1937 ctdb->recovery_lock_fd != -1) {
1938 close(ctdb->recovery_lock_fd);
1939 ctdb->recovery_lock_fd = -1;
1940 unban_all_nodes(ctdb);
1944 /* ok, let that guy become recmaster then */
1945 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1947 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1948 talloc_free(mem_ctx);
1952 talloc_free(mem_ctx);
1958 force the start of the election process
1960 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1961 struct ctdb_node_map *nodemap)
1964 struct ctdb_context *ctdb = rec->ctdb;
1966 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1968 /* set all nodes to recovery mode to stop all internode traffic */
1969 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1971 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1975 talloc_free(rec->election_timeout);
1976 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1977 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1978 ctdb_election_timeout, rec);
1980 ret = send_election_request(rec, pnn, true);
1982 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1986 /* wait for a few seconds to collect all responses */
1987 ctdb_wait_election(rec);
1993 handler for when a node changes its flags
1995 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1996 TDB_DATA data, void *private_data)
1999 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2000 struct ctdb_node_map *nodemap=NULL;
2001 TALLOC_CTX *tmp_ctx;
2002 uint32_t changed_flags;
2004 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2005 int disabled_flag_changed;
2007 if (data.dsize != sizeof(*c)) {
2008 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2012 tmp_ctx = talloc_new(ctdb);
2013 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2015 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2017 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2018 talloc_free(tmp_ctx);
2023 for (i=0;i<nodemap->num;i++) {
2024 if (nodemap->nodes[i].pnn == c->pnn) break;
2027 if (i == nodemap->num) {
2028 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2029 talloc_free(tmp_ctx);
2033 changed_flags = c->old_flags ^ c->new_flags;
2035 if (nodemap->nodes[i].flags != c->new_flags) {
2036 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2039 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2041 nodemap->nodes[i].flags = c->new_flags;
2043 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2044 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2047 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2048 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2052 ctdb->recovery_master == ctdb->pnn &&
2053 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2054 /* Only do the takeover run if the perm disabled or unhealthy
2055 flags changed since these will cause an ip failover but not
2057 If the node became disconnected or banned this will also
2058 lead to an ip address failover but that is handled
2061 if (disabled_flag_changed) {
2062 rec->need_takeover_run = true;
2066 talloc_free(tmp_ctx);
2070 handler for when we need to push out flag changes ot all other nodes
2072 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2073 TDB_DATA data, void *private_data)
2076 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2077 struct ctdb_node_map *nodemap=NULL;
2078 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2082 /* find the recovery master */
2083 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2085 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2086 talloc_free(tmp_ctx);
2090 /* read the node flags from the recmaster */
2091 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2093 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2094 talloc_free(tmp_ctx);
2097 if (c->pnn >= nodemap->num) {
2098 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2099 talloc_free(tmp_ctx);
2103 /* send the flags update to all connected nodes */
2104 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2106 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2107 nodes, 0, CONTROL_TIMEOUT(),
2111 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2113 talloc_free(tmp_ctx);
2117 talloc_free(tmp_ctx);
2121 struct verify_recmode_normal_data {
2123 enum monitor_result status;
2126 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2128 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2131 /* one more node has responded with recmode data*/
2134 /* if we failed to get the recmode, then return an error and let
2135 the main loop try again.
2137 if (state->state != CTDB_CONTROL_DONE) {
2138 if (rmdata->status == MONITOR_OK) {
2139 rmdata->status = MONITOR_FAILED;
2144 /* if we got a response, then the recmode will be stored in the
2147 if (state->status != CTDB_RECOVERY_NORMAL) {
2148 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2149 rmdata->status = MONITOR_RECOVERY_NEEDED;
2156 /* verify that all nodes are in normal recovery mode */
2157 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2159 struct verify_recmode_normal_data *rmdata;
2160 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2161 struct ctdb_client_control_state *state;
2162 enum monitor_result status;
2165 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2166 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2168 rmdata->status = MONITOR_OK;
2170 /* loop over all active nodes and send an async getrecmode call to
2172 for (j=0; j<nodemap->num; j++) {
2173 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2176 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2178 nodemap->nodes[j].pnn);
2179 if (state == NULL) {
2180 /* we failed to send the control, treat this as
2181 an error and try again next iteration
2183 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2184 talloc_free(mem_ctx);
2185 return MONITOR_FAILED;
2188 /* set up the callback functions */
2189 state->async.fn = verify_recmode_normal_callback;
2190 state->async.private_data = rmdata;
2192 /* one more control to wait for to complete */
2197 /* now wait for up to the maximum number of seconds allowed
2198 or until all nodes we expect a response from has replied
2200 while (rmdata->count > 0) {
2201 event_loop_once(ctdb->ev);
2204 status = rmdata->status;
2205 talloc_free(mem_ctx);
2210 struct verify_recmaster_data {
2211 struct ctdb_recoverd *rec;
2214 enum monitor_result status;
2217 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2219 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2222 /* one more node has responded with recmaster data*/
2225 /* if we failed to get the recmaster, then return an error and let
2226 the main loop try again.
2228 if (state->state != CTDB_CONTROL_DONE) {
2229 if (rmdata->status == MONITOR_OK) {
2230 rmdata->status = MONITOR_FAILED;
2235 /* if we got a response, then the recmaster will be stored in the
2238 if (state->status != rmdata->pnn) {
2239 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2240 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2241 rmdata->status = MONITOR_ELECTION_NEEDED;
2248 /* verify that all nodes agree that we are the recmaster */
2249 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2251 struct ctdb_context *ctdb = rec->ctdb;
2252 struct verify_recmaster_data *rmdata;
2253 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2254 struct ctdb_client_control_state *state;
2255 enum monitor_result status;
2258 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2259 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2263 rmdata->status = MONITOR_OK;
2265 /* loop over all active nodes and send an async getrecmaster call to
2267 for (j=0; j<nodemap->num; j++) {
2268 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2271 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2273 nodemap->nodes[j].pnn);
2274 if (state == NULL) {
2275 /* we failed to send the control, treat this as
2276 an error and try again next iteration
2278 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2279 talloc_free(mem_ctx);
2280 return MONITOR_FAILED;
2283 /* set up the callback functions */
2284 state->async.fn = verify_recmaster_callback;
2285 state->async.private_data = rmdata;
2287 /* one more control to wait for to complete */
2292 /* now wait for up to the maximum number of seconds allowed
2293 or until all nodes we expect a response from has replied
2295 while (rmdata->count > 0) {
2296 event_loop_once(ctdb->ev);
2299 status = rmdata->status;
2300 talloc_free(mem_ctx);
2305 /* called to check that the allocation of public ip addresses is ok.
2307 static int verify_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn)
2309 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2310 struct ctdb_all_public_ips *ips = NULL;
2311 struct ctdb_uptime *uptime1 = NULL;
2312 struct ctdb_uptime *uptime2 = NULL;
2315 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2316 CTDB_CURRENT_NODE, &uptime1);
2318 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2319 talloc_free(mem_ctx);
2323 /* read the ip allocation from the local node */
2324 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2326 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2327 talloc_free(mem_ctx);
2331 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2332 CTDB_CURRENT_NODE, &uptime2);
2334 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2335 talloc_free(mem_ctx);
2339 /* skip the check if the startrecovery time has changed */
2340 if (timeval_compare(&uptime1->last_recovery_started,
2341 &uptime2->last_recovery_started) != 0) {
2342 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2343 talloc_free(mem_ctx);
2347 /* skip the check if the endrecovery time has changed */
2348 if (timeval_compare(&uptime1->last_recovery_finished,
2349 &uptime2->last_recovery_finished) != 0) {
2350 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2351 talloc_free(mem_ctx);
2355 /* skip the check if we have started but not finished recovery */
2356 if (timeval_compare(&uptime1->last_recovery_finished,
2357 &uptime1->last_recovery_started) != 1) {
2358 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2359 talloc_free(mem_ctx);
2364 /* verify that we have the ip addresses we should have
2365 and we dont have ones we shouldnt have.
2366 if we find an inconsistency we set recmode to
2367 active on the local node and wait for the recmaster
2368 to do a full blown recovery
2370 for (j=0; j<ips->num; j++) {
2371 if (ips->ips[j].pnn == pnn) {
2372 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2373 struct takeover_run_reply rd;
2376 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2377 ctdb_addr_to_str(&ips->ips[j].addr)));
2381 data.dptr = (uint8_t *)&rd;
2382 data.dsize = sizeof(rd);
2384 ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2386 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2390 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2391 struct takeover_run_reply rd;
2394 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2395 ctdb_addr_to_str(&ips->ips[j].addr)));
2399 data.dptr = (uint8_t *)&rd;
2400 data.dsize = sizeof(rd);
2402 ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2404 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2410 talloc_free(mem_ctx);
2415 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2417 struct ctdb_node_map **remote_nodemaps = callback_data;
2419 if (node_pnn >= ctdb->num_nodes) {
2420 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2424 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
2428 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2429 struct ctdb_node_map *nodemap,
2430 struct ctdb_node_map **remote_nodemaps)
2434 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2435 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2437 CONTROL_TIMEOUT(), false, tdb_null,
2438 async_getnodemap_callback,
2440 remote_nodemaps) != 0) {
2441 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* result codes for the reclock child; RECLOCK_OK is also the byte the
   child writes down the pipe on success */
enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};

/* state for an in-flight asynchronous reclock check */
struct ctdb_check_reclock_state {
	struct ctdb_context *ctdb;
	struct timeval start_time;	/* for reporting lock latency */
	int fd[2];			/* pipe: child writes [1], parent reads [0] */
	pid_t child;			/* pid of the checking child process */
	struct timed_event *te;		/* timeout event, freed on reply */
	struct fd_event *fde;		/* read event on fd[0] */
	enum reclock_child_status status;
};
2460 /* when we free the reclock state we must kill any child process.
2462 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2464 struct ctdb_context *ctdb = state->ctdb;
2466 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2468 if (state->fd[0] != -1) {
2469 close(state->fd[0]);
2472 if (state->fd[1] != -1) {
2473 close(state->fd[1]);
2476 kill(state->child, SIGKILL);
2481 called if our check_reclock child times out. this would happen if
2482 i/o to the reclock file blocks.
2484 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2485 struct timeval t, void *private_data)
2487 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2488 struct ctdb_check_reclock_state);
2490 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2491 state->status = RECLOCK_TIMEOUT;
2494 /* this is called when the child process has completed checking the reclock
2495 file and has written data back to us through the pipe.
 * Reads the single status byte; anything other than RECLOCK_OK (or a
 * short/failed read) marks the check failed.
 */
2497 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2498 uint16_t flags, void *private_data)
2500 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2501 struct ctdb_check_reclock_state);
2505 /* we got a response from our child process so we can abort the
/* cancel the 15s timeout now that the child has answered */
2508 talloc_free(state->te);
2511 ret = read(state->fd[0], &c, 1);
2512 if (ret != 1 || c != RECLOCK_OK) {
2513 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2514 state->status = RECLOCK_FAILED;
2519 state->status = RECLOCK_OK;
/* Verify that the recovery lock we hold is still valid by forking a
 * child that pread()s one byte from the lock file and reports the
 * result back over a pipe.  The fork isolates us from a hung cluster
 * filesystem: if the child's i/o blocks, the 15s timed event fires
 * instead of wedging the recovery daemon.
 */
2523 static int check_recovery_lock(struct ctdb_context *ctdb)
2526 struct ctdb_check_reclock_state *state;
2527 pid_t parent = getpid();
2529 if (ctdb->recovery_lock_fd == -1) {
2530 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2534 state = talloc(ctdb, struct ctdb_check_reclock_state);
2535 CTDB_NO_MEMORY(ctdb, state);
2538 state->start_time = timeval_current();
2539 state->status = RECLOCK_CHECKING;
2543 ret = pipe(state->fd);
2546 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2550 state->child = fork();
2551 if (state->child == (pid_t)-1) {
2552 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2553 close(state->fd[0]);
2555 close(state->fd[1]);
/* child: probe the lock file and report one status byte */
2561 if (state->child == 0) {
2562 char cc = RECLOCK_OK;
2563 close(state->fd[0]);
2566 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2567 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2568 cc = RECLOCK_FAILED;
/* NOTE(review): write() return values are ignored here and below;
 * a failed pipe write would go unnoticed by the child. */
2571 write(state->fd[1], &cc, 1);
2572 /* make sure we die when our parent dies */
2573 while (kill(parent, 0) == 0 || errno != ESRCH) {
2575 write(state->fd[1], &cc, 1);
/* parent: close the write end and wait for the child's report */
2579 close(state->fd[1]);
2581 set_close_on_exec(state->fd[0]);
2583 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
/* destructor kills the child and reports latency on free */
2585 talloc_set_destructor(state, check_reclock_destructor);
2587 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2588 ctdb_check_reclock_timeout, state);
2589 if (state->te == NULL) {
2590 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2595 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2596 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2597 reclock_child_handler,
2600 if (state->fde == NULL) {
2601 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* block (pumping events) until the child answers or the timer fires */
2606 while (state->status == RECLOCK_CHECKING) {
2607 event_loop_once(ctdb->ev);
2610 if (state->status == RECLOCK_FAILED) {
2611 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
/* drop our fd so the next monitor iteration re-takes the lock */
2612 close(ctdb->recovery_lock_fd);
2613 ctdb->recovery_lock_fd = -1;
/* Sync our cached recovery-lock file path with the main daemon's
 * current setting.  Handles three cases: the reclock was disabled,
 * we had no cached path yet, or the path changed — in each case any
 * open lock fd is closed so it gets re-acquired against the new file.
 */
2622 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2624 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2625 const char *reclockfile;
2627 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2628 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2629 talloc_free(tmp_ctx);
/* daemon reports no reclock file: disable verification entirely */
2633 if (reclockfile == NULL) {
2634 if (ctdb->recovery_lock_file != NULL) {
2635 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2636 talloc_free(ctdb->recovery_lock_file);
2637 ctdb->recovery_lock_file = NULL;
2638 if (ctdb->recovery_lock_fd != -1) {
2639 close(ctdb->recovery_lock_fd);
2640 ctdb->recovery_lock_fd = -1;
2643 ctdb->tunable.verify_recovery_lock = 0;
2644 talloc_free(tmp_ctx);
/* first time we learn the path: cache it and drop any stale fd */
2648 if (ctdb->recovery_lock_file == NULL) {
2649 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2650 if (ctdb->recovery_lock_fd != -1) {
2651 close(ctdb->recovery_lock_fd);
2652 ctdb->recovery_lock_fd = -1;
2654 talloc_free(tmp_ctx);
/* unchanged path: nothing to do */
2659 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2660 talloc_free(tmp_ctx);
/* path changed: swap in the new one; verification is switched off
 * here — presumably re-enabled once the new lock is taken (the
 * intervening lines are not visible in this view). */
2664 talloc_free(ctdb->recovery_lock_file);
2665 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2666 ctdb->tunable.verify_recovery_lock = 0;
2667 if (ctdb->recovery_lock_fd != -1) {
2668 close(ctdb->recovery_lock_fd);
2669 ctdb->recovery_lock_fd = -1;
2672 talloc_free(tmp_ctx);
2677 the main monitoring loop
/* Core of the recovery daemon: registers message handlers once, then
 * loops forever (once per recover_interval) checking daemon health,
 * bans, the reclock, recmaster election state, and — when we are the
 * recmaster — cluster-wide nodemap/vnnmap consistency, triggering
 * do_recovery() or a takeover run whenever disagreement is found.
 */
2679 static void monitor_cluster(struct ctdb_context *ctdb)
2682 TALLOC_CTX *mem_ctx=NULL;
2683 struct ctdb_node_map *nodemap=NULL;
2684 struct ctdb_node_map *recmaster_nodemap=NULL;
2685 struct ctdb_node_map **remote_nodemaps=NULL;
2686 struct ctdb_vnn_map *vnnmap=NULL;
2687 struct ctdb_vnn_map *remote_vnnmap=NULL;
2688 int32_t debug_level;
2690 struct ctdb_recoverd *rec;
2692 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2694 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2695 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2699 rec->priority_time = timeval_current();
2701 /* register a message port for sending memory dumps */
2702 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2704 /* register a message port for recovery elections */
2705 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2707 /* when nodes are disabled/enabled */
2708 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2710 /* when we are asked to push out a flag change */
2711 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2713 /* register a message port for vacuum fetch */
2714 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2716 /* register a message port for reloadnodes */
2717 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2719 /* register a message port for performing a takeover run */
2720 ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2722 /* register a message port for disabling the ip check for a short while */
2723 ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
/* top of the monitor loop: fresh temporary context every iteration */
2727 talloc_free(mem_ctx);
2730 mem_ctx = talloc_new(ctdb);
2732 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2736 /* we only check for recovery once every second */
2737 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2739 /* verify that the main daemon is still running */
2740 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2741 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2745 /* ping the local daemon to tell it we are alive */
2746 ctdb_ctrl_recd_ping(ctdb);
2748 if (rec->election_timeout) {
2749 /* an election is in progress */
2753 /* read the debug level from the parent and update locally */
2754 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2756 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2759 LogLevel = debug_level;
2762 /* We must check if we need to ban a node here but we want to do this
2763 as early as possible so we dont wait until we have pulled the node
2764 map from the local node. thats why we have the hardcoded value 20
2766 for (i=0; i<ctdb->num_nodes; i++) {
2767 struct ctdb_banning_state *ban_state;
2769 if (ctdb->nodes[i]->ban_state == NULL) {
2772 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
2773 if (ban_state->count < 20) {
2776 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2777 ctdb->nodes[i]->pnn, ban_state->count,
2778 ctdb->tunable.recovery_ban_period));
2779 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2780 ban_state->count = 0;
2783 /* get relevant tunables */
2784 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2786 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2790 /* get the current recovery lock file from the server */
2791 if (update_recovery_lock_file(ctdb) != 0) {
2792 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2796 /* Make sure that if recovery lock verification becomes disabled when
2799 if (ctdb->tunable.verify_recovery_lock == 0) {
2800 if (ctdb->recovery_lock_fd != -1) {
2801 close(ctdb->recovery_lock_fd);
2802 ctdb->recovery_lock_fd = -1;
2806 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2807 if (pnn == (uint32_t)-1) {
2808 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2812 /* get the vnnmap */
2813 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2815 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2820 /* get number of nodes */
2822 talloc_free(rec->nodemap);
2823 rec->nodemap = NULL;
2826 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2828 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2831 nodemap = rec->nodemap;
2833 /* check which node is the recovery master */
2834 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2836 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2840 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2841 if (rec->recmaster != pnn) {
2842 if (rec->ip_reallocate_ctx != NULL) {
2843 talloc_free(rec->ip_reallocate_ctx);
2844 rec->ip_reallocate_ctx = NULL;
2845 rec->reallocate_callers = NULL;
2848 /* if there are takeovers requested, perform it and notify the waiters */
2849 if (rec->reallocate_callers) {
2850 process_ipreallocate_requests(ctdb, rec);
/* no recmaster at all yet: force an initial election */
2853 if (rec->recmaster == (uint32_t)-1) {
2854 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2855 force_election(rec, pnn, nodemap);
2860 /* if the local daemon is STOPPED, we verify that the databases are
2861 also frozen and thet the recmode is set to active
2863 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2864 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2866 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2868 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2869 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2871 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2873 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2876 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2878 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2885 /* If the local node is stopped, verify we are not the recmaster
2886 and yield this role if so
2888 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2889 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2890 force_election(rec, pnn, nodemap);
2894 /* check that we (recovery daemon) and the local ctdb daemon
2895 agrees on whether we are banned or not
2899 /* remember our own node flags */
2900 rec->node_flags = nodemap->nodes[pnn].flags;
2902 /* count how many active nodes there are */
2903 rec->num_active = 0;
2904 rec->num_connected = 0;
2905 for (i=0; i<nodemap->num; i++) {
2906 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2909 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2910 rec->num_connected++;
2915 /* verify that the recmaster node is still active */
2916 for (j=0; j<nodemap->num; j++) {
2917 if (nodemap->nodes[j].pnn==rec->recmaster) {
2922 if (j == nodemap->num) {
2923 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2924 force_election(rec, pnn, nodemap);
2928 /* if recovery master is disconnected we must elect a new recmaster */
2929 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2930 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2931 force_election(rec, pnn, nodemap);
2935 /* grab the nodemap from the recovery master to check if it is banned */
2936 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2937 mem_ctx, &recmaster_nodemap);
2939 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2940 nodemap->nodes[j].pnn));
2945 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2946 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2947 force_election(rec, pnn, nodemap);
2952 /* verify that we have all ip addresses we should have and we dont
2953 * have addresses we shouldnt have.
2955 if (ctdb->do_checkpublicip) {
2956 if (rec->ip_check_disable_ctx == NULL) {
2957 if (verify_ip_allocation(ctdb, rec, pnn) != 0) {
2958 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2964 /* if we are not the recmaster then we do not need to check
2965 if recovery is needed
2967 if (pnn != rec->recmaster) {
/* ---- everything below runs only on the recovery master ---- */
2972 /* ensure our local copies of flags are right */
2973 ret = update_local_flags(rec, nodemap);
2974 if (ret == MONITOR_ELECTION_NEEDED) {
2975 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2976 force_election(rec, pnn, nodemap);
2979 if (ret != MONITOR_OK) {
2980 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2984 /* update the list of public ips that a node can handle for
2987 if (ctdb->num_nodes != nodemap->num) {
2988 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2989 reload_nodes_file(ctdb);
2992 for (j=0; j<nodemap->num; j++) {
2993 /* release any existing data */
2994 if (ctdb->nodes[j]->public_ips) {
2995 talloc_free(ctdb->nodes[j]->public_ips);
2996 ctdb->nodes[j]->public_ips = NULL;
2999 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3003 /* grab a new shiny list of public ips from the node */
3004 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
3005 ctdb->nodes[j]->pnn,
3007 &ctdb->nodes[j]->public_ips)) {
3008 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
3009 ctdb->nodes[j]->pnn));
3015 /* verify that all active nodes agree that we are the recmaster */
3016 switch (verify_recmaster(rec, nodemap, pnn)) {
3017 case MONITOR_RECOVERY_NEEDED:
3018 /* can not happen */
3020 case MONITOR_ELECTION_NEEDED:
3021 force_election(rec, pnn, nodemap);
3025 case MONITOR_FAILED:
3030 if (rec->need_recovery) {
3031 /* a previous recovery didn't finish */
3032 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3036 /* verify that all active nodes are in normal mode
3037 and not in recovery mode
3039 switch (verify_recmode(ctdb, nodemap)) {
3040 case MONITOR_RECOVERY_NEEDED:
3041 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3043 case MONITOR_FAILED:
3045 case MONITOR_ELECTION_NEEDED:
3046 /* can not happen */
3052 if (ctdb->tunable.verify_recovery_lock != 0) {
3053 /* we should have the reclock - check its not stale */
3054 ret = check_recovery_lock(ctdb);
3056 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3057 ctdb_set_culprit(rec, ctdb->pnn);
3058 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3063 /* get the nodemap for all active remote nodes
3065 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3066 if (remote_nodemaps == NULL) {
3067 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3070 for(i=0; i<nodemap->num; i++) {
3071 remote_nodemaps[i] = NULL;
3073 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3074 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3078 /* verify that all other nodes have the same nodemap as we have
3080 for (j=0; j<nodemap->num; j++) {
3081 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3085 if (remote_nodemaps[j] == NULL) {
3086 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3087 ctdb_set_culprit(rec, j);
3092 /* if the nodes disagree on how many nodes there are
3093 then this is a good reason to try recovery
3095 if (remote_nodemaps[j]->num != nodemap->num) {
3096 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3097 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3098 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3099 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3103 /* if the nodes disagree on which nodes exist and are
3104 active, then that is also a good reason to do recovery
3106 for (i=0;i<nodemap->num;i++) {
3107 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3108 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3109 nodemap->nodes[j].pnn, i,
3110 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3111 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3112 do_recovery(rec, mem_ctx, pnn, nodemap,
3118 /* verify the flags are consistent
3120 for (i=0; i<nodemap->num; i++) {
3121 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3125 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3126 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3127 nodemap->nodes[j].pnn,
3128 nodemap->nodes[i].pnn,
3129 remote_nodemaps[j]->nodes[i].flags,
/* NOTE(review): the comparison above uses nodes[i].flags, but this
 * "our" value prints nodes[j].flags — looks like it should be
 * nodes[i].flags. Affects the log message only; confirm upstream. */
3130 nodemap->nodes[j].flags));
3132 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3133 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3134 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3135 do_recovery(rec, mem_ctx, pnn, nodemap,
3139 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3140 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3141 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3142 do_recovery(rec, mem_ctx, pnn, nodemap,
3151 /* there better be the same number of lmasters in the vnn map
3152 as there are active nodes or we will have to do a recovery
3154 if (vnnmap->size != rec->num_active) {
3155 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3156 vnnmap->size, rec->num_active));
3157 ctdb_set_culprit(rec, ctdb->pnn);
3158 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3162 /* verify that all active nodes in the nodemap also exist in
3165 for (j=0; j<nodemap->num; j++) {
3166 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3169 if (nodemap->nodes[j].pnn == pnn) {
3173 for (i=0; i<vnnmap->size; i++) {
3174 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3178 if (i == vnnmap->size) {
3179 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3180 nodemap->nodes[j].pnn));
3181 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3182 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3188 /* verify that all other nodes have the same vnnmap
3189 and are from the same generation
3191 for (j=0; j<nodemap->num; j++) {
3192 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3195 if (nodemap->nodes[j].pnn == pnn) {
3199 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3200 mem_ctx, &remote_vnnmap);
3202 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3203 nodemap->nodes[j].pnn));
3207 /* verify the vnnmap generation is the same */
3208 if (vnnmap->generation != remote_vnnmap->generation) {
3209 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3210 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3211 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3212 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3216 /* verify the vnnmap size is the same */
3217 if (vnnmap->size != remote_vnnmap->size) {
3218 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3219 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3220 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3221 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3225 /* verify the vnnmap is the same */
3226 for (i=0;i<vnnmap->size;i++) {
3227 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3228 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3229 nodemap->nodes[j].pnn));
3230 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3231 do_recovery(rec, mem_ctx, pnn, nodemap,
3238 /* we might need to change who has what IP assigned */
3239 if (rec->need_takeover_run) {
3240 rec->need_takeover_run = false;
3242 /* execute the "startrecovery" event script on all nodes */
3243 ret = run_startrecovery_eventscript(rec, nodemap);
3245 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3246 ctdb_set_culprit(rec, ctdb->pnn);
3247 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3250 ret = ctdb_takeover_run(ctdb, nodemap);
3252 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3253 ctdb_set_culprit(rec, ctdb->pnn);
3254 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3257 /* execute the "recovered" event script on all nodes */
3258 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3260 // we cant check whether the event completed successfully
3261 // since this script WILL fail if the node is in recovery mode
3262 // and if that race happens, the code here would just cause a second
3263 // cascading recovery.
3265 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3266 ctdb_set_culprit(rec, ctdb->pnn);
3267 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3278 event handler for when the main ctdbd dies
/* fd-event callback on the parent pipe: when ctdbd exits the pipe
 * becomes readable/EOF and the recovery daemon terminates itself.
 */
3280 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3281 uint16_t flags, void *private_data)
3283 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3288 called regularly to verify that the recovery daemon is still running
/* Runs in the MAIN daemon every 30s: if the recoverd child is gone,
 * shut the whole daemon down cleanly (release IPs, run the shutdown
 * event script) rather than keep serving without recovery support.
 */
3290 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3291 struct timeval yt, void *p)
3293 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3295 if (kill(ctdb->recoverd_pid, 0) != 0) {
3296 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
3298 ctdb_stop_recoverd(ctdb);
3299 ctdb_stop_keepalive(ctdb);
3300 ctdb_stop_monitoring(ctdb);
3301 ctdb_release_all_ips(ctdb);
3302 if (ctdb->methods != NULL) {
3303 ctdb->methods->shutdown(ctdb);
3305 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
/* re-arm ourselves for the next 30s check */
3310 event_add_timed(ctdb->ev, ctdb,
3311 timeval_current_ofs(30, 0),
3312 ctdb_check_recd, ctdb);
/* SIGCHLD handler for the recovery daemon: reap exited children
 * (e.g. reclock-check processes) without blocking.
 */
3315 static void recd_sig_child_handler(struct event_context *ev,
3316 struct signal_event *se, int signum, int count,
3320 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3325 pid = waitpid(-1, &status, WNOHANG);
3327 if (errno != ECHILD) {
3328 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3333 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3339 startup the recovery daemon as a child of the main ctdb daemon
/* Fork the recovery daemon.  The parent keeps the write end of a pipe
 * and schedules the 30s ctdb_check_recd watchdog; the child switches
 * into client mode, watches the pipe's read end so it dies with its
 * parent, installs a SIGCHLD handler, and enters monitor_cluster()
 * (which never returns).
 */
3341 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3344 struct signal_event *se;
3346 if (pipe(fd) != 0) {
3350 ctdb->ctdbd_pid = getpid();
3352 ctdb->recoverd_pid = fork();
3353 if (ctdb->recoverd_pid == -1) {
/* parent path: arm the watchdog and return */
3357 if (ctdb->recoverd_pid != 0) {
3359 event_add_timed(ctdb->ev, ctdb,
3360 timeval_current_ofs(30, 0),
3361 ctdb_check_recd, ctdb);
/* child path from here on */
3367 srandom(getpid() ^ time(NULL));
3369 if (switch_from_server_to_client(ctdb) != 0) {
3370 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3374 DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* exit when the parent closes its end of the pipe (parent death) */
3376 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3377 ctdb_recoverd_parent, &fd[0]);
3379 /* set up a handler to pick up sigchld */
3380 se = event_add_signal(ctdb->ev, ctdb,
3382 recd_sig_child_handler,
3385 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3389 monitor_cluster(ctdb);
3391 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3396 shutdown the recovery daemon
/* Ask the recovery daemon child to terminate; no-op if it was never
 * started (recoverd_pid == 0).
 */
3398 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3400 if (ctdb->recoverd_pid == 0) {
3404 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3405 kill(ctdb->recoverd_pid, SIGTERM);