4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/tevent/tevent.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb_client.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
/* Singly-linked list node: one entry per "ctdb ipreallocate" client waiting
 * to be notified when the current takeover run completes. */
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
39 struct rd_memdump_reply *rd; /* reply address/srvid of the waiting caller */
/* Per-node misbehaviour ("culprit") accounting used to decide when a node
 * should be banned.  NOTE(review): the `count` field is referenced elsewhere
 * (ctdb_set_culprit_count) but its declaration line was lost in extraction. */
42 struct ctdb_banning_state {
44 struct timeval last_reported_time; /* last time this node was blamed; used to expire old transgressions */
48 private state of recovery daemon
/* Private state of the recovery daemon (one instance per recoverd process). */
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
55 uint32_t last_culprit_node; /* pnn of the most recently blamed node */
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run; /* set when IP allocation is inconsistent and a takeover run must be (re)done */
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout; /* non-NULL while an election is in progress; cleared by ctdb_election_timeout */
63 struct vacuum_info *vacuum_info; /* list of in-flight vacuum-fetch operations */
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx; /* non-NULL while IP verification is temporarily disabled */
67 struct ctdb_control_get_ifaces *ifaces;
70 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
71 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
73 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
76 ban a node for a period of time
/* Ban a node for ban_time seconds by sending it a SET_BAN control.
 * Invalid pnns are logged and ignored; a failed control is logged only
 * (banning is best-effort from the recovery daemon's point of view). */
78 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
81 struct ctdb_context *ctdb = rec->ctdb;
82 struct ctdb_ban_time bantime;
84 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
86 if (!ctdb_validate_pnn(ctdb, pnn)) {
87 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* NOTE(review): bantime.pnn assignment line appears to have been lost in extraction */
92 bantime.time = ban_time;
94 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
96 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
102 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
106 run the "recovered" eventscript on all nodes
/* Run the "recovered" eventscript on all active nodes by broadcasting
 * CTDB_CONTROL_END_RECOVERY.  `caller` is only used for log attribution.
 * Returns 0 on success, negative on failure (per the visible error path). */
108 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
113 tmp_ctx = talloc_new(ctdb);
114 CTDB_NO_MEMORY(ctdb, tmp_ctx);
116 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
117 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
119 CONTROL_TIMEOUT(), false, tdb_null,
122 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
124 talloc_free(tmp_ctx);
128 talloc_free(tmp_ctx);
133 remember the trouble maker
/* Charge `count` ban credits to node `culprit`.  If the node has behaved for
 * longer than the recovery_grace_period tunable, its previous credits are
 * forgiven first.  Also records the node as the last culprit. */
135 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
137 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
138 struct ctdb_banning_state *ban_state;
/* NOTE(review): bound check uses `>` but `culprit` is then used as an index
 * into ctdb->nodes[] — `>=` looks intended; confirm against the full file. */
140 if (culprit > ctdb->num_nodes) {
141 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
/* lazily allocate the per-node banning state on first blame */
145 if (ctdb->nodes[culprit]->ban_state == NULL) {
146 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
147 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
151 ban_state = ctdb->nodes[culprit]->ban_state;
152 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
153 /* this was the first time in a long while this node
154 misbehaved so we will forgive any old transgressions.
156 ban_state->count = 0;
159 ban_state->count += count;
160 ban_state->last_reported_time = timeval_current();
161 rec->last_culprit_node = culprit;
165 remember the trouble maker
/* Convenience wrapper: blame `culprit` with a single ban credit. */
167 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
169 ctdb_set_culprit_count(rec, culprit, 1);
173 /* this callback is called for every node that failed to execute the
/* Async-control failure callback: invoked for every node that failed the
 * "startrecovery" event; marks that node as a recovery-failure culprit. */
176 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
178 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
180 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
182 ctdb_set_culprit(rec, node_pnn);
186 run the "startrecovery" eventscript on all nodes
/* Run the "startrecovery" eventscript on all active nodes via
 * CTDB_CONTROL_START_RECOVERY; nodes that fail are blamed through
 * startrecovery_fail_callback.  Returns 0 on success. */
188 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
192 struct ctdb_context *ctdb = rec->ctdb;
194 tmp_ctx = talloc_new(ctdb);
195 CTDB_NO_MEMORY(ctdb, tmp_ctx);
197 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
198 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
200 CONTROL_TIMEOUT(), false, tdb_null,
202 startrecovery_fail_callback,
204 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
205 talloc_free(tmp_ctx);
209 talloc_free(tmp_ctx);
/* Per-node callback for GET_CAPABILITIES: validates the reply payload
 * (must be exactly one uint32_t) and stores it in the node's entry. */
213 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
215 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
216 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
219 if (node_pnn < ctdb->num_nodes) {
220 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
225 update the node capabilities for all connected nodes
/* Refresh the cached capabilities of all active nodes by broadcasting
 * CTDB_CONTROL_GET_CAPABILITIES; replies land in async_getcap_callback. */
227 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
232 tmp_ctx = talloc_new(ctdb);
233 CTDB_NO_MEMORY(ctdb, tmp_ctx);
235 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
236 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
240 async_getcap_callback, NULL,
242 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
243 talloc_free(tmp_ctx);
247 talloc_free(tmp_ctx);
/* Failure callback for the freeze step of recovery: a node that could not
 * be frozen is charged a full nodemap's worth of ban credits. */
251 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
253 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
255 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
256 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Failure callback for starting the recovery transaction on a node:
 * blames the node with nodemap->num ban credits. */
259 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
261 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
263 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
264 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
268 change recovery mode on all nodes
/* Switch the recovery mode on all active nodes.  When entering
 * CTDB_RECOVERY_ACTIVE the databases are first frozen, one pass per
 * database priority level, then SET_RECMODE is broadcast. */
270 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
276 tmp_ctx = talloc_new(ctdb);
277 CTDB_NO_MEMORY(ctdb, tmp_ctx);
279 /* freeze all nodes */
280 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
281 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
/* priorities are 1-based, hence i=1..NUM_DB_PRIORITIES inclusive */
284 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
285 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
290 set_recmode_fail_callback,
292 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
293 talloc_free(tmp_ctx);
/* broadcast the new recovery mode to every active node */
300 data.dsize = sizeof(uint32_t);
301 data.dptr = (unsigned char *)&rec_mode;
303 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
309 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
310 talloc_free(tmp_ctx);
314 talloc_free(tmp_ctx);
319 change recovery master on all node
/* Tell every active node that `pnn` is the recovery master by broadcasting
 * CTDB_CONTROL_SET_RECMASTER with the pnn as payload.  Returns 0 on success. */
321 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
327 tmp_ctx = talloc_new(ctdb);
328 CTDB_NO_MEMORY(ctdb, tmp_ctx);
330 data.dsize = sizeof(uint32_t);
331 data.dptr = (unsigned char *)&pnn;
333 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
334 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
336 CONTROL_TIMEOUT(), false, data,
339 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
340 talloc_free(tmp_ctx);
344 talloc_free(tmp_ctx);
348 /* update all remote nodes to use the same db priority that we have
349 this can fail if the remove node has not yet been upgraded to
350 support this function, so we always return success and never fail
351 a recovery if this call fails.
/* Push the local node's per-database priorities out to all active nodes.
 * Per the comment above: failures are tolerated (older remote nodes may not
 * support SET_DB_PRIORITY), so errors never fail a recovery. */
353 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
354 struct ctdb_node_map *nodemap,
355 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
360 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
362 /* step through all local databases */
363 for (db=0; db<dbmap->num;db++) {
365 struct ctdb_db_priority db_prio;
368 db_prio.db_id = dbmap->dbs[db].dbid;
369 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
371 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
375 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
377 data.dptr = (uint8_t *)&db_prio;
378 data.dsize = sizeof(db_prio);
380 if (ctdb_client_async_control(ctdb,
381 CTDB_CONTROL_SET_DB_PRIORITY,
383 CONTROL_TIMEOUT(), false, data,
386 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
394 ensure all other nodes have attached to any databases that we have
/* Ensure every other active node is attached to every database this node
 * has: for each remote node, fetch its dbmap, and create (by name) any of
 * our databases it is missing, preserving the persistent flag. */
396 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
397 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
400 struct ctdb_dbid_map *remote_dbmap;
402 /* verify that all other nodes have all our databases */
403 for (j=0; j<nodemap->num; j++) {
404 /* we dont need to ourself ourselves */
405 if (nodemap->nodes[j].pnn == pnn) {
408 /* dont check nodes that are unavailable */
409 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
413 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
414 mem_ctx, &remote_dbmap);
416 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
420 /* step through all local databases */
421 for (db=0; db<dbmap->num;db++) {
/* linear search of the remote dbmap for our dbid */
425 for (i=0;i<remote_dbmap->num;i++) {
426 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
430 /* the remote node already have this database */
431 if (i!=remote_dbmap->num) {
434 /* ok so we need to create this database */
435 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
438 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
441 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
442 mem_ctx, name, dbmap->dbs[db].persistent);
444 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
455 ensure we are attached to any databases that anyone else is attached to
/* Mirror of create_missing_remote_databases: ensure the local node is
 * attached to every database any other active node has, then re-read the
 * local dbmap (through the dbmap out-parameter) so callers see new dbs. */
457 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
458 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
461 struct ctdb_dbid_map *remote_dbmap;
463 /* verify that we have all database any other node has */
464 for (j=0; j<nodemap->num; j++) {
465 /* we dont need to ourself ourselves */
466 if (nodemap->nodes[j].pnn == pnn) {
469 /* dont check nodes that are unavailable */
470 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
474 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
475 mem_ctx, &remote_dbmap);
477 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
481 /* step through all databases on the remote node */
482 for (db=0; db<remote_dbmap->num;db++) {
/* do we already have this remote db locally? */
485 for (i=0;i<(*dbmap)->num;i++) {
486 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
490 /* we already have this db locally */
491 if (i!=(*dbmap)->num) {
494 /* ok so we need to create this database and
497 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
498 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
500 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
501 nodemap->nodes[j].pnn));
504 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
505 remote_dbmap->dbs[db].persistent);
507 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our dbmap so the newly attached db is visible to the caller */
510 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
512 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
523 pull the remote database contents from one node into the recdb
/* Pull one database from srcnode and merge its records into the temporary
 * recovery tdb.  A pulled record replaces an existing one only when it has
 * a higher rsn (or equal rsn with a dmaster other than the recovery master)
 * — i.e. a standard highest-rsn merge. */
525 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
526 struct tdb_wrap *recdb, uint32_t dbid,
531 struct ctdb_marshall_buffer *reply;
532 struct ctdb_rec_data *rec;
534 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
536 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
537 CONTROL_TIMEOUT(), &outdata);
539 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
540 talloc_free(tmp_ctx);
544 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the marshalled reply before walking it */
546 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
547 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
548 talloc_free(tmp_ctx);
552 rec = (struct ctdb_rec_data *)&reply->data[0];
/* walk the packed records; each record is rec->length bytes long */
556 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
558 struct ctdb_ltdb_header *hdr;
561 key.dptr = &rec->data[0];
562 key.dsize = rec->keylen;
563 data.dptr = &rec->data[key.dsize];
564 data.dsize = rec->datalen;
566 hdr = (struct ctdb_ltdb_header *)data.dptr;
568 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
569 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
570 talloc_free(tmp_ctx);
574 /* fetch the existing record, if any */
575 existing = tdb_fetch(recdb->tdb, key);
577 if (existing.dptr != NULL) {
578 struct ctdb_ltdb_header header;
579 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
580 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
581 (unsigned)existing.dsize, srcnode));
583 talloc_free(tmp_ctx);
586 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing record unless the new one wins the rsn comparison */
588 if (!(header.rsn < hdr->rsn ||
589 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
594 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
595 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
596 talloc_free(tmp_ctx);
601 talloc_free(tmp_ctx);
607 pull all the remote database contents into the recdb
/* Pull a database from every active node into the recovery tdb; the
 * merge-by-rsn is done per node in pull_one_remote_database.  A node that
 * fails the pull gets nodemap->num ban credits. */
609 static int pull_remote_database(struct ctdb_context *ctdb,
610 struct ctdb_recoverd *rec,
611 struct ctdb_node_map *nodemap,
612 struct tdb_wrap *recdb, uint32_t dbid,
617 /* pull all records from all other nodes across onto this node
618 (this merges based on rsn)
620 for (j=0; j<nodemap->num; j++) {
621 /* dont merge from nodes that are unavailable */
622 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
625 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
626 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
627 nodemap->nodes[j].pnn));
628 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
638 update flags on all active nodes
/* Propagate `flags` for node `pnn` to all nodes via MODIFY_FLAGS
 * (set = flags, clear = ~flags). */
640 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
644 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
646 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
654 ensure all nodes have the same vnnmap we do
/* Push our vnnmap to every active node so the whole cluster agrees on the
 * virtual-node to physical-node mapping. */
656 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
657 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
661 /* push the new vnn map out to all the nodes */
662 for (j=0; j<nodemap->num; j++) {
663 /* dont push to nodes that are unavailable */
664 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
668 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
/* NOTE(review): error message prints `pnn`, not the failing node
 * nodemap->nodes[j].pnn — likely a copy/paste slip; confirm upstream. */
670 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* Fields of struct vacuum_info (the struct's opening line was lost in
 * extraction): one entry per in-flight vacuum-fetch stream, kept on the
 * doubly-linked rec->vacuum_info list. */
680 struct vacuum_info *next, *prev;
681 struct ctdb_recoverd *rec;
683 struct ctdb_db_context *ctdb_db;
684 struct ctdb_marshall_buffer *recs; /* private copy of the marshalled records still to process */
685 struct ctdb_rec_data *r; /* cursor into recs->data */
688 static void vacuum_fetch_next(struct vacuum_info *v);
691 called when a vacuum fetch has completed - just free it and do the next one
/* Completion callback for one vacuum fetch: simply advance to the next
 * record in the vacuum list. */
693 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
695 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
697 vacuum_fetch_next(v);
702 process the next element from the vacuum list
/* Process the next record from a vacuum-fetch list: skip records that are
 * locked, missing, malformed, or already local, and issue an async NULL_FUNC
 * migration call for the first record that needs fetching.  The callback
 * (vacuum_fetch_callback) re-enters this function for the next record. */
704 static void vacuum_fetch_next(struct vacuum_info *v)
706 struct ctdb_call call;
707 struct ctdb_rec_data *r;
709 while (v->recs->count) {
710 struct ctdb_client_call_state *state;
712 struct ctdb_ltdb_header *hdr;
715 call.call_id = CTDB_NULL_FUNC;
716 call.flags = CTDB_IMMEDIATE_MIGRATION;
717 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
/* advance the cursor past the current packed record */
720 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
723 call.key.dptr = &r->data[0];
724 call.key.dsize = r->keylen;
726 /* ensure we don't block this daemon - just skip a record if we can't get
728 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
732 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
733 if (data.dptr == NULL) {
734 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
738 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
740 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
744 hdr = (struct ctdb_ltdb_header *)data.dptr;
745 if (hdr->dmaster == v->rec->ctdb->pnn) {
746 /* its already local */
748 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* hand the migration off asynchronously; drop the chainlock first */
754 state = ctdb_call_send(v->ctdb_db, &call);
755 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
757 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
761 state->async.fn = vacuum_fetch_callback;
762 state->async.private_data = v;
771 destroy a vacuum info structure
/* talloc destructor: unlink the vacuum_info from the recoverd's list when
 * it is freed, so the list never holds a dangling pointer. */
773 static int vacuum_info_destructor(struct vacuum_info *v)
775 DLIST_REMOVE(v->rec->vacuum_info, v);
781 handler for vacuum fetch
/* Message handler for vacuum-fetch requests: unmarshal the record list,
 * ignore duplicates already in flight for the same (srcnode, db), work out
 * the database's name and persistence, attach to it, and queue a new
 * vacuum_info that processes the records asynchronously. */
783 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
784 TDB_DATA data, void *private_data)
786 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
787 struct ctdb_marshall_buffer *recs;
789 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
791 struct ctdb_dbid_map *dbmap=NULL;
792 bool persistent = false;
793 struct ctdb_db_context *ctdb_db;
794 struct ctdb_rec_data *r;
796 struct vacuum_info *v;
798 recs = (struct ctdb_marshall_buffer *)data.dptr;
799 r = (struct ctdb_rec_data *)&recs->data[0];
801 if (recs->count == 0) {
802 talloc_free(tmp_ctx);
/* drop the request if we are already vacuuming this db from this node */
808 for (v=rec->vacuum_info;v;v=v->next) {
809 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
810 /* we're already working on records from this node */
811 talloc_free(tmp_ctx);
816 /* work out if the database is persistent */
817 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
819 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
820 talloc_free(tmp_ctx);
824 for (i=0;i<dbmap->num;i++) {
825 if (dbmap->dbs[i].dbid == recs->db_id) {
826 persistent = dbmap->dbs[i].persistent;
830 if (i == dbmap->num) {
831 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
832 talloc_free(tmp_ctx);
836 /* find the name of this database */
837 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
838 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
839 talloc_free(tmp_ctx);
/* attach so we have a db context to issue fetch calls against */
844 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
845 if (ctdb_db == NULL) {
846 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
847 talloc_free(tmp_ctx);
851 v = talloc_zero(rec, struct vacuum_info);
853 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
854 talloc_free(tmp_ctx);
859 v->srcnode = srcnode;
860 v->ctdb_db = ctdb_db;
/* take a private copy: the message buffer dies with the handler */
861 v->recs = talloc_memdup(v, recs, data.dsize);
862 if (v->recs == NULL) {
863 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
865 talloc_free(tmp_ctx);
868 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
870 DLIST_ADD(rec->vacuum_info, v);
872 talloc_set_destructor(v, vacuum_info_destructor);
874 vacuum_fetch_next(v);
875 talloc_free(tmp_ctx);
880 called when ctdb_wait_timeout should finish
/* Timer callback for ctdb_wait_timeout: flags completion through the
 * uint32_t pointed to by `p`. */
882 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
883 struct timeval yt, void *p)
885 uint32_t *timed_out = (uint32_t *)p;
890 wait for a given number of seconds
/* Block for `secs` seconds while still servicing the event loop: arm a
 * one-shot timer and spin event_loop_once until it fires. */
892 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
894 uint32_t timed_out = 0;
895 time_t usecs = (secs - (time_t)secs) * 1000000; /* fractional part in microseconds */
896 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
898 event_loop_once(ctdb->ev);
903 called when an election times out (ends)
/* Timer callback: the election window has closed.  Clearing
 * rec->election_timeout is what lets ctdb_wait_election return. */
905 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
906 struct timeval t, void *p)
908 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
909 rec->election_timeout = NULL;
912 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
917 wait for an election to finish. It finished election_timeout seconds after
918 the last election packet is received
/* Pump the event loop until the election timer (rec->election_timeout)
 * has fired and been cleared by ctdb_election_timeout. */
920 static void ctdb_wait_election(struct ctdb_recoverd *rec)
922 struct ctdb_context *ctdb = rec->ctdb;
923 while (rec->election_timeout) {
924 event_loop_once(ctdb->ev);
929 Update our local flags from all remote connected nodes.
930 This is only run when we are or we belive we are the recovery master
/* Run while we are (or believe we are) recovery master: fetch the nodemap
 * from every connected remote node and reconcile per-node flags.  On a
 * mismatch the remote view wins: we push it cluster-wide with modflags and
 * update our local nodemap copy.  Returns a MONITOR_* result on failure. */
932 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
935 struct ctdb_context *ctdb = rec->ctdb;
936 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
938 /* get the nodemap for all active remote nodes and verify
939 they are the same as for this node
941 for (j=0; j<nodemap->num; j++) {
942 struct ctdb_node_map *remote_nodemap=NULL;
945 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
948 if (nodemap->nodes[j].pnn == ctdb->pnn) {
952 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
953 mem_ctx, &remote_nodemap);
955 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
956 nodemap->nodes[j].pnn));
957 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
958 talloc_free(mem_ctx);
959 return MONITOR_FAILED;
961 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
962 /* We should tell our daemon about this so it
963 updates its flags or else we will log the same
964 message again in the next iteration of recovery.
965 Since we are the recovery master we can just as
966 well update the flags on all nodes.
968 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
970 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
974 /* Update our local copy of the flags in the recovery
977 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
978 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
979 nodemap->nodes[j].flags));
980 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
982 talloc_free(remote_nodemap);
984 talloc_free(mem_ctx);
989 /* Create a new random generation ip.
990 The generation id can not be the INVALID_GENERATION id
/* Produce a random generation id, retrying until it differs from
 * INVALID_GENERATION (that value is reserved). */
992 static uint32_t new_generation(void)
997 generation = random();
999 if (generation != INVALID_GENERATION) {
1009 create a temporary working database
/* Create the temporary recovery tdb under db_directory_state.  Opened with
 * TDB_NOLOCK (single-process use) plus O_EXCL so a stale file is an error;
 * TDB_NOMMAP is added under valgrind to keep accesses checkable. */
1011 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1014 struct tdb_wrap *recdb;
1017 /* open up the temporary recovery database */
1018 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1019 ctdb->db_directory_state,
1026 tdb_flags = TDB_NOLOCK;
1027 if (ctdb->valgrinding) {
1028 tdb_flags |= TDB_NOMMAP;
1030 tdb_flags |= TDB_DISALLOW_NESTING;
1032 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1033 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1034 if (recdb == NULL) {
1035 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1045 a traverse function for pulling all relevent records from recdb
/* Fields of struct recdb_data (struct head lost in extraction): shared
 * traverse state for traverse_recdb — the growing marshall buffer plus
 * bookkeeping flags referenced there (len, failed, persistent). */
1048 struct ctdb_context *ctdb;
1049 struct ctdb_marshall_buffer *recdata; /* accumulated packed records to push */
/* tdb_traverse callback: marshal each non-empty recdb record into
 * params->recdata, rewriting dmaster to the local pnn for non-persistent
 * databases so records are owned by the recovery master after the push. */
1055 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1057 struct recdb_data *params = (struct recdb_data *)p;
1058 struct ctdb_rec_data *rec;
1059 struct ctdb_ltdb_header *hdr;
1061 /* skip empty records */
1062 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1066 /* update the dmaster field to point to us */
1067 hdr = (struct ctdb_ltdb_header *)data.dptr;
1068 if (!params->persistent) {
1069 hdr->dmaster = params->ctdb->pnn;
1070 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1073 /* add the record to the blob ready to send to the nodes */
1074 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1076 params->failed = true;
/* NOTE(review): realloc with a NULL talloc context reparents recdata to the
 * NULL context — presumably freed explicitly by the caller; confirm. */
1079 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1080 if (params->recdata == NULL) {
1081 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1082 rec->length + params->len, params->recdata->count));
1083 params->failed = true;
1086 params->recdata->count++;
1087 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1088 params->len += rec->length;
1095 push the recdb database out to all nodes
/* Push the merged recovery tdb out to all active nodes: traverse recdb into
 * one marshall buffer (traverse_recdb) and broadcast it with PUSH_DB. */
1097 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1099 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1101 struct recdb_data params;
1102 struct ctdb_marshall_buffer *recdata;
1104 TALLOC_CTX *tmp_ctx;
1107 tmp_ctx = talloc_new(ctdb);
1108 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1110 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1111 CTDB_NO_MEMORY(ctdb, recdata);
1113 recdata->db_id = dbid;
1116 params.recdata = recdata;
1117 params.len = offsetof(struct ctdb_marshall_buffer, data);
1118 params.failed = false;
1119 params.persistent = persistent;
/* NOTE(review): "¶ms" below is mojibake for "&params" ('&para' was decoded
 * as the ¶ entity) — restore "&params" when fixing the file's encoding. */
1121 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1122 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1123 talloc_free(params.recdata);
1124 talloc_free(tmp_ctx);
1128 if (params.failed) {
1129 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1130 talloc_free(params.recdata);
1131 talloc_free(tmp_ctx);
/* the traverse may have reallocated the buffer; pick up the new pointer */
1135 recdata = params.recdata;
1137 outdata.dptr = (void *)recdata;
1138 outdata.dsize = params.len;
1140 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1141 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1143 CONTROL_TIMEOUT(), false, outdata,
1146 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1147 talloc_free(recdata);
1148 talloc_free(tmp_ctx);
1152 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1153 dbid, recdata->count));
1155 talloc_free(recdata);
1156 talloc_free(tmp_ctx);
1163 go through a full recovery on one database
/* Full recovery of one database: pull every node's copy into a fresh recdb
 * (merge by rsn), wipe the database cluster-wide inside the recovery
 * transaction, then push the merged contents back out. */
1165 static int recover_database(struct ctdb_recoverd *rec,
1166 TALLOC_CTX *mem_ctx,
1170 struct ctdb_node_map *nodemap,
1171 uint32_t transaction_id)
1173 struct tdb_wrap *recdb;
1175 struct ctdb_context *ctdb = rec->ctdb;
1177 struct ctdb_control_wipe_database w;
1180 recdb = create_recdb(ctdb, mem_ctx);
1181 if (recdb == NULL) {
1185 /* pull all remote databases onto the recdb */
1186 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1188 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1192 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1194 /* wipe all the remote databases. This is safe as we are in a transaction */
1196 w.transaction_id = transaction_id;
1198 data.dptr = (void *)&w;
1199 data.dsize = sizeof(w);
1201 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1202 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1204 CONTROL_TIMEOUT(), false, data,
1207 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1212 /* push out the correct database. This sets the dmaster and skips
1213 the empty records */
1214 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1220 /* all done with this database */
1227 reload the nodes file
/* Re-read the nodes file into the ctdb context. */
1229 static void reload_nodes_file(struct ctdb_context *ctdb)
1232 ctdb_load_nodes_file(ctdb);
/* Refresh each active node's known and available public-IP lists, and (when
 * IP failover is enabled and checks aren't disabled) verify the remote
 * allocation, scheduling a takeover run on inconsistency.  On error the
 * failing node's pnn is reported through *culprit. */
1235 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1236 struct ctdb_recoverd *rec,
1237 struct ctdb_node_map *nodemap,
/* nodemap and the nodes array must describe the same cluster size */
1243 if (ctdb->num_nodes != nodemap->num) {
1244 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1245 ctdb->num_nodes, nodemap->num));
1247 *culprit = ctdb->pnn;
1252 for (j=0; j<nodemap->num; j++) {
1253 /* release any existing data */
1254 if (ctdb->nodes[j]->known_public_ips) {
1255 talloc_free(ctdb->nodes[j]->known_public_ips);
1256 ctdb->nodes[j]->known_public_ips = NULL;
1258 if (ctdb->nodes[j]->available_public_ips) {
1259 talloc_free(ctdb->nodes[j]->available_public_ips);
1260 ctdb->nodes[j]->available_public_ips = NULL;
1263 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1267 /* grab a new shiny list of public ips from the node */
1268 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1270 ctdb->nodes[j]->pnn,
1273 &ctdb->nodes[j]->known_public_ips);
1275 DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1276 ctdb->nodes[j]->pnn));
1278 *culprit = ctdb->nodes[j]->pnn;
/* only verify when failover is enabled and checking isn't disabled */
1283 if (ctdb->tunable.disable_ip_failover == 0) {
1284 if (rec->ip_check_disable_ctx == NULL) {
1285 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1286 DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1287 rec->need_takeover_run = true;
1292 /* grab a new shiny list of public ips from the node */
1293 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1295 ctdb->nodes[j]->pnn,
1297 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1298 &ctdb->nodes[j]->available_public_ips);
1300 DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1301 ctdb->nodes[j]->pnn));
1303 *culprit = ctdb->nodes[j]->pnn;
1312 /* when we start a recovery, make sure all nodes use the same reclock file
/* Broadcast our recovery-lock file path (including its NUL terminator) to
 * all active nodes via SET_RECLOCK_FILE so the cluster agrees on one file.
 * A NULL recovery_lock_file is handled by the (elided) early-out path. */
1315 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1317 struct ctdb_context *ctdb = rec->ctdb;
1318 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1322 if (ctdb->recovery_lock_file == NULL) {
1326 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1327 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1330 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1331 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1337 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1338 talloc_free(tmp_ctx);
1342 talloc_free(tmp_ctx);
1348 * this callback is called for every node that failed to execute ctdb_takeover_run()
1349 * and set flag to re-run takeover run.
/* Failure callback for ctdb_takeover_run: blame the failing node and flag
 * that the takeover run must be retried. */
1351 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1353 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1355 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the takeover run. Setting it as recovery fail culprit\n", node_pnn));
1357 ctdb_set_culprit(rec, node_pnn);
1358 rec->need_takeover_run = true;
1363   we are the recmaster, and recovery is needed - start a recovery run
/*
 * Full cluster recovery, driven by the elected recovery master.
 * Visible phases (NOTE(review): this listing is elided — error-return and
 * closing-brace lines between the numbered lines are not shown):
 *   1. ban nodes that caused too many recent recoveries
 *   2. take the recovery lock (if verify_recovery_lock is enabled)
 *   3. create missing databases locally and remotely, sync db priorities
 *   4. set CTDB_RECOVERY_ACTIVE, run "startrecovery" event, push flags
 *   5. bump the generation, start a cluster-wide transaction, recover
 *      every database, then commit
 *   6. rebuild the vnnmap from lmaster-capable active nodes, push it,
 *      re-assert ourselves as recmaster, set CTDB_RECOVERY_NORMAL
 *   7. re-run IP takeover, run "recovered" event, broadcast RECONFIGURE
 *   8. reset ban counts and suppress re-recovery for rerecovery_timeout
 *
 * Returns 0 on success (rec->need_recovery cleared); on any failure the
 * elided error paths leave rec->need_recovery == true so recovery is
 * retried by the monitor loop.
 */
1365 static int do_recovery(struct ctdb_recoverd *rec,
1366 		       TALLOC_CTX *mem_ctx, uint32_t pnn,
1367 		       struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1369 	struct ctdb_context *ctdb = rec->ctdb;
1371 	uint32_t generation;
1372 	struct ctdb_dbid_map *dbmap;
1375 	struct timeval start_time;
1376 	uint32_t culprit = (uint32_t)-1;
1378 	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1380 	/* if recovery fails, force it again */
1381 	rec->need_recovery = true;
/* Phase 1: ban any node that has caused 2*num_nodes recoveries recently */
1383 	for (i=0; i<ctdb->num_nodes; i++) {
1384 		struct ctdb_banning_state *ban_state;
1386 		if (ctdb->nodes[i]->ban_state == NULL) {
1389 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1390 		if (ban_state->count < 2*ctdb->num_nodes) {
1393 		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1394 			ctdb->nodes[i]->pnn, ban_state->count,
1395 			ctdb->tunable.recovery_ban_period));
1396 		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1397 		ban_state->count = 0;
/* Phase 2: grab the recovery lock; abort (elided return) if we cannot */
1401         if (ctdb->tunable.verify_recovery_lock != 0) {
1402 		DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1403 		start_time = timeval_current();
1404 		if (!ctdb_recovery_lock(ctdb, true)) {
1405 			ctdb_set_culprit(rec, pnn);
1406 			DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1409 		ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1410 		DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1413 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1415 	/* get a list of all databases */
1416 	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1418 		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1422 	/* we do the db creation before we set the recovery mode, so the freeze happens
1423 	   on all databases we will be dealing with. */
1425 	/* verify that we have all the databases any other node has */
1426 	ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1428 		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1432 	/* verify that all other nodes have all our databases */
1433 	ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1435 		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1438 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1440 	/* update the database priority for all remote databases */
1441 	ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1443 		DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1445 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1448 	/* update all other nodes to use the same setting for reclock files
1449 	   as the local recovery master.
/* return value deliberately ignored here: reclock sync is best-effort */
1451 	sync_recovery_lock_file_across_cluster(rec);
1453 	/* set recovery mode to active on all nodes */
1454 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1456 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1460 	/* execute the "startrecovery" event script on all nodes */
1461 	ret = run_startrecovery_eventscript(rec, nodemap);
1463 		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1468 	  update all nodes to have the same flags that we have
/* disconnected nodes cannot receive the flag update, so skip them */
1470 	for (i=0;i<nodemap->num;i++) {
1471 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1475 		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1477 			DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1482 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1484 	/* pick a new generation number */
1485 	generation = new_generation();
1487 	/* change the vnnmap on this node to use the new generation
1488 	   number but not on any other nodes.
1489 	   this guarantees that if we abort the recovery prematurely
1490 	   for some reason (a node stops responding?)
1491 	   that we can just return immediately and we will reenter
1492 	   recovery shortly again.
1493 	   I.e. we deliberately leave the cluster with an inconsistent
1494 	   generation id to allow us to abort recovery at any stage and
1495 	   just restart it from scratch.
1497 	vnnmap->generation = generation;
1498 	ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1500 		DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* Phase 5: cluster-wide transaction keyed on the new generation number */
1504 	data.dptr = (void *)&generation;
1505 	data.dsize = sizeof(uint32_t);
1507 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1508 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1510 					CONTROL_TIMEOUT(), false, data,
1512 					transaction_start_fail_callback,
1514 		DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
/* roll back any partially-started transactions before bailing out */
1515 		if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1517 					CONTROL_TIMEOUT(), false, tdb_null,
1521 			DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1526 	DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* pull/push every database under the new generation */
1528 	for (i=0;i<dbmap->num;i++) {
1529 		ret = recover_database(rec, mem_ctx,
1531 				       dbmap->dbs[i].persistent,
1532 				       pnn, nodemap, generation);
1534 			DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1539 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1541 	/* commit all the changes */
1542 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1544 					CONTROL_TIMEOUT(), false, data,
1547 		DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1551 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1554 	/* update the capabilities for all nodes */
1555 	ret = update_capabilities(ctdb, nodemap);
1557 		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
/* Phase 6: rebuild vnnmap from active, lmaster-capable nodes */
1561 	/* build a new vnn map with all the currently active and
1563 	generation = new_generation();
1564 	vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1565 	CTDB_NO_MEMORY(ctdb, vnnmap);
1566 	vnnmap->generation = generation;
1568 	vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1569 	CTDB_NO_MEMORY(ctdb, vnnmap->map);
1570 	for (i=j=0;i<nodemap->num;i++) {
1571 		if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1574 		if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1575 			/* this node can not be an lmaster */
1576 			DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1581 		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1582 		CTDB_NO_MEMORY(ctdb, vnnmap->map);
1583 		vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* nobody is lmaster-capable: fall back to just the local recmaster */
1586 	if (vnnmap->size == 0) {
1587 		DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1589 		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1590 		CTDB_NO_MEMORY(ctdb, vnnmap->map);
1591 		vnnmap->map[0] = pnn;
1594 	/* update to the new vnnmap on all nodes */
1595 	ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1597 		DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1601 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1603 	/* update recmaster to point to us for all nodes */
1604 	ret = set_recovery_master(ctdb, nodemap, pnn);
1606 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1610 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1613 	  update all nodes to have the same flags that we have
1615 	for (i=0;i<nodemap->num;i++) {
1616 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1620 		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1622 			DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1627 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1629 	/* disable recovery mode */
1630 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1632 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1636 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
/* Phase 7: IP failover — a failure here only defers the takeover run */
1639 	  tell nodes to takeover their public IPs
1641 	ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1643 		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1645 		rec->need_takeover_run = true;
1648 	rec->need_takeover_run = false;
1649 	ret = ctdb_takeover_run(ctdb, nodemap, NULL, NULL);
1651 		DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1652 		rec->need_takeover_run = true;
1655 	/* execute the "recovered" event script on all nodes */
1656 	ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1658 		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1662 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1664 	/* send a message to all clients telling them that the cluster
1665 	   has been reconfigured */
1666 	ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1668 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1670 	rec->need_recovery = false;
1672 	/* we managed to complete a full recovery, make sure to forgive
1673 	   any past sins by the nodes that could now participate in the
1676 	DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1677 	for (i=0;i<nodemap->num;i++) {
1678 		struct ctdb_banning_state *ban_state;
1680 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
/* index ctdb->nodes by pnn, not by nodemap position */
1684 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1685 		if (ban_state == NULL) {
1689 		ban_state->count = 0;
1693 	/* We just finished a recovery successfully.
1694 	   We now wait for rerecovery_timeout before we allow
1695 	   another recovery to take place.
1697 	DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
/* blocking wait: deliberately stalls the recovery daemon's main loop */
1698 	ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1699 	DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
1706   elections are won by first checking the number of connected nodes, then
1707   the priority time, then the pnn
/*
 * Wire format of an election broadcast.  Compared field-by-field in
 * ctdb_election_win(): higher num_connected wins, then the older
 * priority_time (longest-running daemon), then the lower pnn.
 * node_flags lets peers disqualify banned/stopped candidates.
 * NOTE(review): listing is elided — a pnn member (used elsewhere as
 * em->pnn) is not visible in this view.
 */
1709 struct election_message {
1710 	uint32_t num_connected;
1711 	struct timeval priority_time;
1713 	uint32_t node_flags;
1717   form this nodes election data
/*
 * Fill *em with this node's election credentials: pnn, daemon start time,
 * current node flags and the count of connected nodes.  Also refreshes
 * rec->node_flags from a freshly fetched nodemap.
 * If we lack CTDB_CAP_RECMASTER the data is deliberately made as weak as
 * possible so we cannot win.
 */
1719 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1722 	struct ctdb_node_map *nodemap;
1723 	struct ctdb_context *ctdb = rec->ctdb;
1727 	em->pnn = rec->ctdb->pnn;
1728 	em->priority_time = rec->priority_time;
/* nodemap is talloc'd under rec; freed below once flags are copied out */
1730 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1732 		DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1736 	rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1737 	em->node_flags    = rec->node_flags;
1739 	for (i=0;i<nodemap->num;i++) {
1740 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1741 			em->num_connected++;
1745 	/* we shouldnt try to win this election if we cant be a recmaster */
1746 	if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* zero connections + "now" as start time loses to any real candidate */
1747 		em->num_connected = 0;
1748 		em->priority_time = timeval_current();
1751 	talloc_free(nodemap);
1755   see if the given election data wins
/*
 * Decide whether the local node beats the remote candidate described by
 * *em.  Returns true if we should (continue to) claim recmastership.
 * Tie-breaking order: capability/banned/stopped disqualifiers first,
 * then most connected nodes, then oldest priority_time, then lowest pnn.
 * NOTE(review): the return statements between comparisons are elided.
 */
1757 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1759 	struct election_message myem;
1762 	ctdb_election_data(rec, &myem);
1764 	/* we cant win if we dont have the recmaster capability */
1765 	if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1769 	/* we cant win if we are banned */
1770 	if (rec->node_flags & NODE_FLAGS_BANNED) {
1774 	/* we cant win if we are stopped */
1775 	if (rec->node_flags & NODE_FLAGS_STOPPED) {
1779 	/* we will automatically win if the other node is banned */
1780 	if (em->node_flags & NODE_FLAGS_BANNED) {
1784 	/* we will automatically win if the other node is banned */
/* (comment above is stale in the original: this checks STOPPED) */
1785 	if (em->node_flags & NODE_FLAGS_STOPPED) {
1789 	/* try to use the most connected node */
1791 		cmp = (int)myem.num_connected - (int)em->num_connected;
1794 	/* then the longest running node */
1796 		cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break: lower pnn wins */
1800 		cmp = (int)myem.pnn - (int)em->pnn;
1807   send out an election request
/*
 * Broadcast our election credentials on CTDB_SRVID_RECOVERY to all nodes.
 * If update_recmaster is true we also optimistically set the local
 * recmaster to ourselves (pnn), assuming we will win.
 * Returns 0 on success; error paths are elided in this listing.
 */
1809 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1812 	TDB_DATA election_data;
1813 	struct election_message emsg;
1815 	struct ctdb_context *ctdb = rec->ctdb;
1817 	srvid = CTDB_SRVID_RECOVERY;
1819 	ctdb_election_data(rec, &emsg);
/* point the payload at the stack-local message; send is synchronous */
1821 	election_data.dsize = sizeof(struct election_message);
1822 	election_data.dptr  = (unsigned char *)&emsg;
1825 	/* send an election message to all active nodes */
1826 	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1827 	ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1830 	/* A new node that is already frozen has entered the cluster.
1831 	   The existing nodes are not frozen and dont need to be frozen
1832 	   until the election has ended and we start the actual recovery
1834 	if (update_recmaster == true) {
1835 		/* first we assume we will win the election and set 
1836 		   recoverymaster to be ourself on the current node
1838 		ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1840 			DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1850   this function will unban all nodes in the cluster
/*
 * Clear NODE_FLAGS_BANNED on every connected node, via MODIFY_FLAGS
 * controls sent to each banned node.  Best-effort: individual modflags
 * failures are not checked here.
 */
1852 static void unban_all_nodes(struct ctdb_context *ctdb)
1855 	struct ctdb_node_map *nodemap;
1856 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1858 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1860 		DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
/* only touch nodes that are both reachable and currently banned */
1864 	for (i=0;i<nodemap->num;i++) {
1865 		if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1866 		  && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1867 			ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1871 	talloc_free(tmp_ctx);
1876   we think we are winning the election - send a broadcast election request
/*
 * Timed-event callback (scheduled from election_handler): rebroadcast our
 * election request without touching the recmaster setting, then clear the
 * one-shot timer so it can be re-armed later.
 */
1878 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1880 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1883 	ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1885 		DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1888 	talloc_free(rec->send_election_te);
1889 	rec->send_election_te = NULL;
1893   handler for memory dumps
/*
 * Message handler: a client asked the recovery daemon for a talloc memory
 * dump.  data carries a struct rd_memdump_reply naming the pnn/srvid to
 * send the dump back to.  All intermediate allocations hang off tmp_ctx
 * and are released on every exit path.
 */
1895 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid, 
1896 			     TDB_DATA data, void *private_data)
1898 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1901 	struct rd_memdump_reply *rd;
/* validate the reply-address payload before trusting its contents */
1903 	if (data.dsize != sizeof(struct rd_memdump_reply)) {
1904 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1905 		talloc_free(tmp_ctx);
1908 	rd = (struct rd_memdump_reply *)data.dptr;
1910 	dump = talloc_zero(tmp_ctx, TDB_DATA);
1912 		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1913 		talloc_free(tmp_ctx);
1916 	ret = ctdb_dump_memory(ctdb, dump);
1918 		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1919 		talloc_free(tmp_ctx);
1923 	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* ship the dump back to the requester at (rd->pnn, rd->srvid) */
1925 	ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1927 		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1928 		talloc_free(tmp_ctx);
1932 	talloc_free(tmp_ctx);
1936   handler for reload_nodes
/*
 * Message handler: re-read the nodes file on request ("ctdb reloadnodes").
 * The payload is unused; the work is delegated to reload_nodes_file().
 */
1938 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid, 
1939 			     TDB_DATA data, void *private_data)
1941 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1943 	DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1945 	reload_nodes_file(rec->ctdb);
/*
 * Timed-event callback: the "disable ip check" window has expired.
 * Freeing ip_check_disable_ctx both cancels the timer's context and
 * re-enables IP verification (code elsewhere tests this pointer for NULL).
 */
1949 static void reenable_ip_check(struct event_context *ev, struct timed_event *te, 
1950 		      struct timeval yt, void *p)
1952 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1954 	talloc_free(rec->ip_check_disable_ctx);
1955 	rec->ip_check_disable_ctx = NULL;
/*
 * Message handler: a node reports that one public IP assignment changed.
 * Only the current recmaster records the update (into the IP assignment
 * tree); other nodes ignore the message.  Payload must be exactly one
 * struct ctdb_public_ip.
 */
1959 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid, 
1960 			     TDB_DATA data, void *private_data)
1962 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1963 	struct ctdb_public_ip *ip;
1965 	if (rec->recmaster != rec->ctdb->pnn) {
1966 		DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
1970 	if (data.dsize != sizeof(struct ctdb_public_ip)) {
1971 		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
1975 	ip = (struct ctdb_public_ip *)data.dptr;
1977 	update_ip_assignment_tree(rec->ctdb, ip);
/*
 * Message handler: suspend IP-allocation verification for a caller-supplied
 * number of seconds (payload: one uint32_t timeout).  Any existing disable
 * window is cancelled first; a fresh talloc context owns the re-enable
 * timer so freeing the context cancels it.
 */
1981 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid, 
1982 			     TDB_DATA data, void *private_data)
1984 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* drop any previous disable window before starting a new one */
1987 	if (rec->ip_check_disable_ctx != NULL) {
1988 		talloc_free(rec->ip_check_disable_ctx);
1989 		rec->ip_check_disable_ctx = NULL;
1992 	if (data.dsize != sizeof(uint32_t)) {
1993 		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1994 				 "expexting %lu\n", (long unsigned)data.dsize,
1995 				 (long unsigned)sizeof(uint32_t)));
1998 	if (data.dptr == NULL) {
1999 		DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
2003 	timeout = *((uint32_t *)data.dptr);
2004 	DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
2006 	rec->ip_check_disable_ctx = talloc_new(rec);
2007 	CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
/* timer owned by the disable ctx: freeing the ctx cancels re-enable */
2009 	event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
2014   handler for ip reallocate, just add it to the list of callers and 
2015   handle this later in the monitor_cluster loop so we do not recurse
2016   with other callers to takeover_run()
/*
 * Message handler for "ctdb ipreallocate": queue the caller's reply
 * address (a struct rd_memdump_reply) on rec->reallocate_callers.  The
 * actual takeover run and replies happen later in
 * process_ipreallocate_requests(), avoiding reentrant takeover_run().
 */
2018 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid, 
2019 			     TDB_DATA data, void *private_data)
2021 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2022 	struct ip_reallocate_list *caller;
2024 	if (data.dsize != sizeof(struct rd_memdump_reply)) {
2025 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
/* lazily create the context that owns all queued callers */
2029 	if (rec->ip_reallocate_ctx == NULL) {
2030 		rec->ip_reallocate_ctx = talloc_new(rec);
2031 		CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2034 	caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2035 	CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* steal the message payload so it survives past this handler */
2037 	caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2038 	caller->next = rec->reallocate_callers;
2039 	rec->reallocate_callers = caller;
/*
 * Drain the queue built by ip_reallocate_handler(): refresh the remote
 * public-IP lists, run one IP takeover pass, then send the int32 result
 * code back to every queued caller (except those with srvid == 0, who
 * asked for no reply).  Finally free the queue and its owning context.
 */
2044 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2046 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2049 	struct ip_reallocate_list *callers;
2052 	DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2054 	/* update the list of public ips that a node can handle for
2057 	ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2059 		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
/* failure only defers: the monitor loop will retry the takeover run */
2061 		rec->need_takeover_run = true;
2064 	ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
2066 		DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
2067 		rec->need_takeover_run = true;
/* reply payload is the (shared) int32 result of the takeover run */
2071 	result.dsize = sizeof(int32_t);
2072 	result.dptr  = (uint8_t *)&ret;
2074 	for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2076 		/* Someone that sent srvid==0 does not want a reply */
2077 		if (callers->rd->srvid == 0) {
2080 		DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2081 				  "%u:%llu\n", (unsigned)callers->rd->pnn,
2082 				  (unsigned long long)callers->rd->srvid));
2083 		ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2085 			DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2086 					 "message to %u:%llu\n",
2087 					 (unsigned)callers->rd->pnn,
2088 					 (unsigned long long)callers->rd->srvid));
2092 	talloc_free(tmp_ctx);
/* freeing the ctx frees every queued caller in one shot */
2093 	talloc_free(rec->ip_reallocate_ctx);
2094 	rec->ip_reallocate_ctx = NULL;
2095 	rec->reallocate_callers = NULL;
2101   handler for recovery master elections
/*
 * Message handler for incoming election broadcasts (CTDB_SRVID_RECOVERY).
 * Restarts the election timeout, then either (a) we win: arm a delayed
 * rebroadcast of our own candidacy, or (b) we lose: cancel any pending
 * rebroadcast, release the recovery lock if the winner is another node,
 * and record the sender as recmaster on the local daemon.
 * NOTE(review): listing is elided — some branch/return lines are missing.
 */
2103 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid, 
2104 			     TDB_DATA data, void *private_data)
2106 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2108 	struct election_message *em = (struct election_message *)data.dptr;
2109 	TALLOC_CTX *mem_ctx;
2111 	/* we got an election packet - update the timeout for the election */
2112 	talloc_free(rec->election_timeout);
2113 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
2115 						timeval_current_ofs(0, 500000) :
2116 						timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
2117 						ctdb_election_timeout, rec);
2119 	mem_ctx = talloc_new(ctdb);
2121 	/* someone called an election. check their election data
2122 	   and if we disagree and we would rather be the elected node, 
2123 	   send a new election message to all other nodes
2125 	if (ctdb_election_win(rec, em)) {
2126 		if (!rec->send_election_te) {
/* defer our counter-broadcast by 0.5s to let the election settle */
2127 			rec->send_election_te = event_add_timed(ctdb->ev, rec, 
2128 								timeval_current_ofs(0, 500000),
2129 								election_send_request, rec);
2131 		talloc_free(mem_ctx);
2132 		/*unban_all_nodes(ctdb);*/
/* we lost: stop advertising our own candidacy */
2137 	talloc_free(rec->send_election_te);
2138 	rec->send_election_te = NULL;
2140         if (ctdb->tunable.verify_recovery_lock != 0) {
2141 		/* release the recmaster lock */
2142 		if (em->pnn != ctdb->pnn &&
2143 		    ctdb->recovery_lock_fd != -1) {
2144 			close(ctdb->recovery_lock_fd);
2145 			ctdb->recovery_lock_fd = -1;
2146 			unban_all_nodes(ctdb);
2150 	/* ok, let that guy become recmaster then */
2151 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2153 		DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2154 		talloc_free(mem_ctx);
2158 	talloc_free(mem_ctx);
2164   force the start of the election process
/*
 * Kick off a recmaster election: put the whole cluster into recovery mode
 * (halting internode traffic), arm the election timeout, broadcast our
 * candidacy (optimistically claiming recmastership locally), then block
 * in ctdb_wait_election() while responses arrive.
 */
2166 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2167 			   struct ctdb_node_map *nodemap)
2170 	struct ctdb_context *ctdb = rec->ctdb;
2172 	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2174 	/* set all nodes to recovery mode to stop all internode traffic */
2175 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2177 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* replace any running election timer with a fresh one */
2181 	talloc_free(rec->election_timeout);
2182 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
2184 						timeval_current_ofs(0, 500000) :
2185 						timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
2186 						ctdb_election_timeout, rec);
/* true => also set ourselves as recmaster locally, pending the result */
2188 	ret = send_election_request(rec, pnn, true);
2190 		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2194 	/* wait for a few seconds to collect all responses */
2195 	ctdb_wait_election(rec);
2201   handler for when a node changes its flags
/*
 * Message handler for node flag-change notifications (payload: one
 * struct ctdb_node_flag_change).  Syncs the local nodemap view with the
 * announced flags and, when we are the recmaster in NORMAL mode, flags a
 * takeover run if the DISABLED bit changed (disconnect/ban failover is
 * handled elsewhere).
 * NOTE(review): listing is elided — some branches/returns are missing.
 */
2203 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid, 
2204 			    TDB_DATA data, void *private_data)
2207 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2208 	struct ctdb_node_map *nodemap=NULL;
2209 	TALLOC_CTX *tmp_ctx;
2210 	uint32_t changed_flags;
2212 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2213 	int disabled_flag_changed;
2215 	if (data.dsize != sizeof(*c)) {
2216 		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2220 	tmp_ctx = talloc_new(ctdb);
2221 	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2223 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2225 		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2226 		talloc_free(tmp_ctx);
/* locate the announcing node in our freshly-fetched nodemap */
2231 	for (i=0;i<nodemap->num;i++) {
2232 		if (nodemap->nodes[i].pnn == c->pnn) break;
2235 	if (i == nodemap->num) {
2236 		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2237 		talloc_free(tmp_ctx);
2241 	changed_flags = c->old_flags ^ c->new_flags;
2243 	if (nodemap->nodes[i].flags != c->new_flags) {
2244 		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* compare against our *current* view, not the sender's old_flags */
2247 	disabled_flag_changed =  (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2249 	nodemap->nodes[i].flags = c->new_flags;
2251 	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), 
2252 				     CTDB_CURRENT_NODE, &ctdb->recovery_master);
2255 		ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(), 
2256 					   CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2260 	    ctdb->recovery_master == ctdb->pnn &&
2261 	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2262 		/* Only do the takeover run if the perm disabled or unhealthy
2263 		   flags changed since these will cause an ip failover but not
2265 		   If the node became disconnected or banned this will also
2266 		   lead to an ip address failover but that is handled 
2269 		if (disabled_flag_changed) {
2270 			rec->need_takeover_run = true;
2274 	talloc_free(tmp_ctx);
2278   handler for when we need to push out flag changes ot all other nodes
/*
 * Message handler: fetch the authoritative nodemap from the current
 * recmaster and rebroadcast the flags of the node named in the payload
 * (struct ctdb_node_flag_change) to every connected node via
 * CTDB_CONTROL_MODIFY_FLAGS.  Validates that the recmaster's nodemap
 * actually contains the node before pushing.
 */
2280 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid, 
2281 			    TDB_DATA data, void *private_data)
2284 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2285 	struct ctdb_node_map *nodemap=NULL;
2286 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2290 	/* find the recovery master */
2291 	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2293 		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2294 		talloc_free(tmp_ctx);
2298 	/* read the node flags from the recmaster */
2299 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2301 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2302 		talloc_free(tmp_ctx);
2305 	if (c->pnn >= nodemap->num) {
2306 		DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2307 		talloc_free(tmp_ctx);
2311 	/* send the flags update to all connected nodes */
2312 	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2314 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2315 				      nodes, 0, CONTROL_TIMEOUT(),
2319 		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2321 		talloc_free(tmp_ctx);
2325 	talloc_free(tmp_ctx);
/* accumulator shared by verify_recmode() and its per-node callback:
 * counts outstanding replies (field elided in this view) and records the
 * worst status observed so far */
2329 struct verify_recmode_normal_data {
2331 	enum monitor_result status;
/*
 * Per-node completion callback for the async GET_RECMODE controls sent by
 * verify_recmode().  Downgrades status to MONITOR_FAILED on transport
 * failure, or to MONITOR_RECOVERY_NEEDED if any node reports a recmode
 * other than CTDB_RECOVERY_NORMAL.
 */
2334 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2336 	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2339 	/* one more node has responded with recmode data*/
2342 	/* if we failed to get the recmode, then return an error and let
2343 	   the main loop try again.
2345 	if (state->state != CTDB_CONTROL_DONE) {
/* only record the first failure; don't overwrite a stronger verdict */
2346 		if (rmdata->status == MONITOR_OK) {
2347 			rmdata->status = MONITOR_FAILED;
2352 	/* if we got a response, then the recmode will be stored in the
2355 	if (state->status != CTDB_RECOVERY_NORMAL) {
2356 		DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2357 		rmdata->status = MONITOR_RECOVERY_NEEDED;
2364 /* verify that all nodes are in normal recovery mode */
/*
 * Fan out async GET_RECMODE controls to every active node, then pump the
 * event loop until all replies are in (rmdata->count reaches 0).
 * Returns MONITOR_OK when every node is in NORMAL mode, otherwise the
 * degraded status recorded by verify_recmode_normal_callback().
 */
2365 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2367 	struct verify_recmode_normal_data *rmdata;
2368 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2369 	struct ctdb_client_control_state *state;
2370 	enum monitor_result status;
2373 	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2374 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2376 	rmdata->status  = MONITOR_OK;
2378 	/* loop over all active nodes and send an async getrecmode call to
/* banned/stopped/disconnected nodes are not polled */
2380 	for (j=0; j<nodemap->num; j++) {
2381 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2384 		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, 
2386 					nodemap->nodes[j].pnn);
2387 		if (state == NULL) {
2388 			/* we failed to send the control, treat this as 
2389 			   an error and try again next iteration
2391 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2392 			talloc_free(mem_ctx);
2393 			return MONITOR_FAILED;
2396 		/* set up the callback functions */
2397 		state->async.fn = verify_recmode_normal_callback;
2398 		state->async.private_data = rmdata;
2400 		/* one more control to wait for to complete */
2405 	/* now wait for up to the maximum number of seconds allowed
2406 	   or until all nodes we expect a response from has replied
/* blocking wait; each reply decrements rmdata->count in the callback */
2408 	while (rmdata->count > 0) {
2409 		event_loop_once(ctdb->ev);
/* copy status out before freeing the context that owns rmdata */
2412 	status = rmdata->status;
2413 	talloc_free(mem_ctx);
/* accumulator shared by verify_recmaster() and its per-node callback:
 * the recoverd instance (for culprit tracking), the expected recmaster
 * pnn and reply count (fields elided in this view), and the worst status */
2418 struct verify_recmaster_data {
2419 	struct ctdb_recoverd *rec;
2422 	enum monitor_result status;
/*
 * Per-node completion callback for the async GET_RECMASTER controls sent
 * by verify_recmaster().  Downgrades status to MONITOR_FAILED on
 * transport failure; if a node names a different recmaster than ours it
 * is marked culprit and status becomes MONITOR_ELECTION_NEEDED.
 */
2425 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2427 	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2430 	/* one more node has responded with recmaster data*/
2433 	/* if we failed to get the recmaster, then return an error and let
2434 	   the main loop try again.
2436 	if (state->state != CTDB_CONTROL_DONE) {
2437 		if (rmdata->status == MONITOR_OK) {
2438 			rmdata->status = MONITOR_FAILED;
2443 	/* if we got a response, then the recmaster will be stored in the
2446 	if (state->status != rmdata->pnn) {
2447 		DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2448 		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2449 		rmdata->status = MONITOR_ELECTION_NEEDED;
2456 /* verify that all nodes agree that we are the recmaster */
/*
 * Fan out async GET_RECMASTER controls to every active node and pump the
 * event loop until all replies arrive.  Returns MONITOR_OK when every
 * node agrees that pnn (us) is the recmaster; MONITOR_ELECTION_NEEDED or
 * MONITOR_FAILED otherwise (set by verify_recmaster_callback()).
 */
2457 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2459 	struct ctdb_context *ctdb = rec->ctdb;
2460 	struct verify_recmaster_data *rmdata;
2461 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2462 	struct ctdb_client_control_state *state;
2463 	enum monitor_result status;
2466 	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2467 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2471 	rmdata->status  = MONITOR_OK;
2473 	/* loop over all active nodes and send an async getrecmaster call to
2475 	for (j=0; j<nodemap->num; j++) {
2476 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2479 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx, 
2481 					nodemap->nodes[j].pnn);
2482 		if (state == NULL) {
2483 			/* we failed to send the control, treat this as 
2484 			   an error and try again next iteration
2486 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2487 			talloc_free(mem_ctx);
2488 			return MONITOR_FAILED;
2491 		/* set up the callback functions */
2492 		state->async.fn = verify_recmaster_callback;
2493 		state->async.private_data = rmdata;
2495 		/* one more control to wait for to complete */
2500 	/* now wait for up to the maximum number of seconds allowed
2501 	   or until all nodes we expect a response from has replied
/* blocking wait; callback decrements rmdata->count per reply */
2503 	while (rmdata->count > 0) {
2504 		event_loop_once(ctdb->ev);
/* copy status out before freeing the context that owns rmdata */
2507 	status = rmdata->status;
2508 	talloc_free(mem_ctx);
2513 /* called to check that the local allocation of public ip addresses is ok.
/*
 * Compare the public IPs this node actually holds against what the
 * cluster says it should hold.  If interfaces changed, an IP is
 * unassigned while we could host it, an IP we should serve is missing,
 * or we still serve an IP assigned elsewhere, ask the recmaster (via
 * CTDB_SRVID_TAKEOVER_RUN message) to run an IP takeover.
 * Uptime is sampled before and after reading the IP list so the check
 * can be skipped if a recovery started/finished in between (the list
 * would be in flux).  Returns 0 on success/skip, -1 on failure to talk
 * to the local daemon.
 * NOTE(review): interior lines are elided in this extract.
 */
2515 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
2517 	TALLOC_CTX *mem_ctx = talloc_new(NULL);
2518 	struct ctdb_control_get_ifaces *ifaces = NULL;
2519 	struct ctdb_all_public_ips *ips = NULL;
2520 	struct ctdb_uptime *uptime1 = NULL;
2521 	struct ctdb_uptime *uptime2 = NULL;
2523 	bool need_iface_check = false;
2524 	bool need_takeover_run = false;
	/* first uptime sample: taken before reading the IP list */
2526 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2527 			       CTDB_CURRENT_NODE, &uptime1);
2529 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2530 		talloc_free(mem_ctx);
2535 	/* read the interfaces from the local node */
2536 	ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
2538 		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
2539 		talloc_free(mem_ctx);
	/* detect any change versus the cached interface list in rec->ifaces:
	   first run (no cache), different count, or different content */
2544 		need_iface_check = true;
2545 	} else if (rec->ifaces->num != ifaces->num) {
2546 		need_iface_check = true;
2547 	} else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
2548 		need_iface_check = true;
2551 	if (need_iface_check) {
2552 		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2553 				     "local node %u - force takeover run\n",
2555 		need_takeover_run = true;
2558 	/* read the ip allocation from the local node */
2559 	ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2561 		DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2562 		talloc_free(mem_ctx);
	/* second uptime sample: taken after reading the IP list */
2566 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2567 			       CTDB_CURRENT_NODE, &uptime2);
2569 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2570 		talloc_free(mem_ctx);
2574 	/* skip the check if the startrecovery time has changed */
2575 	if (timeval_compare(&uptime1->last_recovery_started,
2576 			    &uptime2->last_recovery_started) != 0) {
2577 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2578 		talloc_free(mem_ctx);
2582 	/* skip the check if the endrecovery time has changed */
2583 	if (timeval_compare(&uptime1->last_recovery_finished,
2584 			    &uptime2->last_recovery_finished) != 0) {
2585 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2586 		talloc_free(mem_ctx);
2590 	/* skip the check if we have started but not finished recovery */
2591 	if (timeval_compare(&uptime1->last_recovery_finished,
2592 			    &uptime1->last_recovery_started) != 1) {
2593 		DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2594 		talloc_free(mem_ctx);
	/* cache the freshly read interface list for the next comparison */
2599 	talloc_free(rec->ifaces);
2600 	rec->ifaces = talloc_steal(rec, ifaces);
2602 	/* verify that we have the ip addresses we should have
2603 	   and we dont have ones we shouldnt have.
2604 	   if we find an inconsistency we set recmode to
2605 	   active on the local node and wait for the recmaster
2606 	   to do a full blown recovery.
2607 	   also if the pnn is -1 and we are healthy and can host the ip
2608 	   we also request a ip reallocation.
2610 	if (ctdb->tunable.disable_ip_failover == 0) {
2611 		for (j=0; j<ips->num; j++) {
			/* unassigned IP while this node is fully healthy
			   (flags == 0) - offer to host it */
2612 			if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
2613 				DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
2614 					ctdb_addr_to_str(&ips->ips[j].addr)));
2615 				need_takeover_run = true;
2616 			} else if (ips->ips[j].pnn == pnn) {
				/* assigned to us but not actually configured
				   on an interface */
2617 				if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2618 					DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2619 						ctdb_addr_to_str(&ips->ips[j].addr)));
2620 					need_takeover_run = true;
				/* assigned elsewhere but still configured here */
2623 				if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2624 					DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2625 						ctdb_addr_to_str(&ips->ips[j].addr)));
2626 					need_takeover_run = true;
2632 	if (need_takeover_run) {
2633 		struct takeover_run_reply rd;
2636 		DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
2640 		data.dptr = (uint8_t *)&rd;
2641 		data.dsize = sizeof(rd);
		/* ask the recmaster to perform the takeover run on our behalf */
2643 		ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2645 			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2648 	talloc_free(mem_ctx);
/*
 * Async-control completion callback for CTDB_CONTROL_GET_NODEMAP:
 * stores the nodemap returned by node 'node_pnn' into the
 * remote_nodemaps[] array passed as callback_data, taking talloc
 * ownership of the reply buffer.  Replies from out-of-range pnns are
 * rejected with an error.
 */
2653 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2655 	struct ctdb_node_map **remote_nodemaps = callback_data;
	/* guard against a reply claiming to come from an invalid pnn */
2657 	if (node_pnn >= ctdb->num_nodes) {
2658 		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2662 	remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Fetch the nodemap from every active node (including this one) in
 * parallel via an async GET_NODEMAP broadcast; results land in
 * remote_nodemaps[] through async_getnodemap_callback.  Returns 0 on
 * success, non-zero if any node failed to reply.
 * NOTE(review): interior lines are elided in this extract.
 */
2666 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2667 			       struct ctdb_node_map *nodemap,
2668 			       struct ctdb_node_map **remote_nodemaps)
	/* true => include the current node in the broadcast list */
2672 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2673 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2675 					CONTROL_TIMEOUT(), false, tdb_null,
2676 					async_getnodemap_callback,
2678 					remote_nodemaps) != 0) {
2679 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* result states for the child process that probes the recovery lock file */
2687 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/*
 * Tracks one in-flight recovery-lock check: the forked child, the pipe
 * used for its one-byte reply, the timeout event guarding against a
 * hung child, and the outcome.
 */
2688 struct ctdb_check_reclock_state {
2689 	struct ctdb_context *ctdb;
	/* when the check started; used to report lock latency */
2690 	struct timeval start_time;
2693 	struct timed_event *te;    /* timeout event (fires if child hangs) */
2694 	struct fd_event *fde;      /* read event on the reply pipe */
2695 	enum reclock_child_status status;
2698 /* when we free the reclock state we must kill any child process.
/*
 * talloc destructor for ctdb_check_reclock_state: reports the elapsed
 * lock latency to the main daemon, closes both pipe ends (if still
 * open) and SIGKILLs the child so it cannot outlive the check.
 */
2700 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2702 	struct ctdb_context *ctdb = state->ctdb;
2704 	ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2706 	if (state->fd[0] != -1) {
2707 		close(state->fd[0]);
2710 	if (state->fd[1] != -1) {
2711 		close(state->fd[1]);
2714 	kill(state->child, SIGKILL);
2719   called if our check_reclock child times out. this would happen if
2720   i/o to the reclock file blocks.
/* Timed-event handler: marks the check as RECLOCK_TIMEOUT so the
 * event loop in check_recovery_lock() can stop waiting. */
2722 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te, 
2723 				    struct timeval t, void *private_data)
2725 	struct ctdb_check_reclock_state *state = talloc_get_type(private_data, 
2726 					   struct ctdb_check_reclock_state);
2728 	DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2729 	state->status = RECLOCK_TIMEOUT;
2732 /* this is called when the child process has completed checking the reclock
2733    file and has written data back to us through the pipe.
/*
 * FD event handler on the reply pipe: reads the one status byte the
 * child wrote, cancels the timeout event, and sets state->status to
 * RECLOCK_OK or RECLOCK_FAILED accordingly.
 */
2735 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde, 
2736 			     uint16_t flags, void *private_data)
2738 	struct ctdb_check_reclock_state *state= talloc_get_type(private_data, 
2739 					     struct ctdb_check_reclock_state);
2743 	/* we got a response from our child process so we can abort the
	   timeout event - it is no longer needed */
2746 	talloc_free(state->te);
	/* a short read or any byte other than RECLOCK_OK means failure */
2749 	ret = read(state->fd[0], &c, 1);
2750 	if (ret != 1 || c != RECLOCK_OK) {
2751 		DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2752 		state->status = RECLOCK_FAILED;
2757 	state->status = RECLOCK_OK;
/*
 * Verify that the recovery lock we hold is not stale by forking a
 * child that pread()s one byte from the lock fd (a blocking CFS would
 * make this hang) and reports the result through a pipe.  The parent
 * waits in the event loop with a 15-second timeout; on failure the
 * lock fd is closed so a new recovery must re-take the lock.
 * Returns non-zero iff the lock is healthy (per the visible paths);
 * NOTE(review): several return statements are elided in this extract,
 * so exact return values on each path cannot be confirmed here.
 */
2761 static int check_recovery_lock(struct ctdb_context *ctdb)
2764 	struct ctdb_check_reclock_state *state;
2765 	pid_t parent = getpid();
2767 	if (ctdb->recovery_lock_fd == -1) {
2768 		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2772 	state = talloc(ctdb, struct ctdb_check_reclock_state);
2773 	CTDB_NO_MEMORY(ctdb, state);
2776 	state->start_time = timeval_current();
2777 	state->status = RECLOCK_CHECKING;
2781 	ret = pipe(state->fd);
2784 		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2788 	state->child = ctdb_fork(ctdb);
2789 	if (state->child == (pid_t)-1) {
2790 		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
		/* clean up both pipe ends on fork failure */
2791 		close(state->fd[0]);
2793 		close(state->fd[1]);
	/* ---- child process: probe the lock file and report back ---- */
2799 	if (state->child == 0) {
2800 		char cc = RECLOCK_OK;
		/* child only writes; close the read end */
2801 		close(state->fd[0]);
2804 		debug_extra = talloc_asprintf(NULL, "recovery-lock:");
		/* a failing pread indicates the lock file is unreadable/stale */
2805 		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2806 			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2807 			cc = RECLOCK_FAILED;
2810 		write(state->fd[1], &cc, 1);
2811 		/* make sure we die when our parent dies */
2812 		while (kill(parent, 0) == 0 || errno != ESRCH) {
2814 		write(state->fd[1], &cc, 1);
	/* ---- parent process: wait for the child's reply ---- */
2818 	close(state->fd[1]);
2820 	set_close_on_exec(state->fd[0]);
2822 	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
2824 	talloc_set_destructor(state, check_reclock_destructor);
	/* give the child at most 15 seconds before declaring a timeout */
2826 	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2827 				    ctdb_check_reclock_timeout, state);
2828 	if (state->te == NULL) {
2829 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2834 	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2836 				reclock_child_handler,
2839 	if (state->fde == NULL) {
2840 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
2844 	tevent_fd_set_auto_close(state->fde);
	/* pump the event loop until timeout or child reply resolves status */
2846 	while (state->status == RECLOCK_CHECKING) {
2847 		event_loop_once(ctdb->ev);
2850 	if (state->status == RECLOCK_FAILED) {
2851 		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
		/* drop the stale lock so it gets re-taken on next recovery */
2852 		close(ctdb->recovery_lock_fd);
2853 		ctdb->recovery_lock_fd = -1;
/*
 * Re-read the configured reclock file path from the main daemon and
 * reconcile our cached copy (ctdb->recovery_lock_file/_fd) with it.
 * Handles three cases: reclock newly disabled, reclock newly enabled,
 * and reclock path changed; an unchanged path is a no-op.  In the
 * disabled/changed cases verify_recovery_lock is turned off until the
 * new setting takes effect.  Returns 0 on success (per visible paths);
 * NOTE(review): return statements are elided in this extract.
 */
2862 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2864 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2865 	const char *reclockfile;
2867 	if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2868 		DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2869 		talloc_free(tmp_ctx);
	/* case 1: reclock has been disabled on the daemon side */
2873 	if (reclockfile == NULL) {
2874 		if (ctdb->recovery_lock_file != NULL) {
2875 			DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2876 			talloc_free(ctdb->recovery_lock_file);
2877 			ctdb->recovery_lock_file = NULL;
2878 			if (ctdb->recovery_lock_fd != -1) {
2879 				close(ctdb->recovery_lock_fd);
2880 				ctdb->recovery_lock_fd = -1;
2883 		ctdb->tunable.verify_recovery_lock = 0;
2884 		talloc_free(tmp_ctx);
	/* case 2: we had no reclock before - adopt the new path */
2888 	if (ctdb->recovery_lock_file == NULL) {
2889 		ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2890 		if (ctdb->recovery_lock_fd != -1) {
2891 			close(ctdb->recovery_lock_fd);
2892 			ctdb->recovery_lock_fd = -1;
2894 		talloc_free(tmp_ctx);
	/* unchanged path: nothing to do */
2899 	if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2900 		talloc_free(tmp_ctx);
	/* case 3: the reclock path has changed - switch to the new file */
2904 	talloc_free(ctdb->recovery_lock_file);
2905 	ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2906 	ctdb->tunable.verify_recovery_lock = 0;
2907 	if (ctdb->recovery_lock_fd != -1) {
2908 		close(ctdb->recovery_lock_fd);
2909 		ctdb->recovery_lock_fd = -1;
2912 	talloc_free(tmp_ctx);
/*
 * One iteration of the recovery daemon's monitoring loop.  Broadly:
 *  1. liveness/housekeeping: check ctdbd is alive, ping it, sync debug
 *     level, apply pending bans, refresh tunables and the reclock file;
 *  2. gather state: local pnn, vnnmap, nodemap, current recmaster;
 *  3. non-recmaster duties: handle ip-reallocate requests, force an
 *     election if there is no recmaster or the recmaster is gone,
 *     verify STOPPED-node invariants, verify local IP allocation;
 *  4. recmaster-only duties: verify all nodes agree we are recmaster,
 *     compare nodemaps/flags/vnnmaps across nodes and trigger
 *     do_recovery() on any inconsistency, and run an IP takeover when
 *     rec->need_takeover_run is set.
 * NOTE(review): many interior lines (returns, closing braces, loop
 * continuations) are elided in this extract; comments describe only
 * the visible control flow.
 */
2916 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2917 		      TALLOC_CTX *mem_ctx)
2920 	struct ctdb_node_map *nodemap=NULL;
2921 	struct ctdb_node_map *recmaster_nodemap=NULL;
2922 	struct ctdb_node_map **remote_nodemaps=NULL;
2923 	struct ctdb_vnn_map *vnnmap=NULL;
2924 	struct ctdb_vnn_map *remote_vnnmap=NULL;
2925 	int32_t debug_level;
2930 	/* verify that the main daemon is still running */
2931 	if (kill(ctdb->ctdbd_pid, 0) != 0) {
2932 		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2936 	/* ping the local daemon to tell it we are alive */
2937 	ctdb_ctrl_recd_ping(ctdb);
2939 	if (rec->election_timeout) {
2940 		/* an election is in progress */
2944 	/* read the debug level from the parent and update locally */
2945 	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2947 		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2950 	LogLevel = debug_level;
2953 	/* We must check if we need to ban a node here but we want to do this
2954 	   as early as possible so we dont wait until we have pulled the node
2955 	   map from the local node. thats why we have the hardcoded value 20
2957 	for (i=0; i<ctdb->num_nodes; i++) {
2958 		struct ctdb_banning_state *ban_state;
2960 		if (ctdb->nodes[i]->ban_state == NULL) {
2963 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		/* ban only nodes that caused 20+ recent recoveries */
2964 		if (ban_state->count < 20) {
2967 		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2968 			ctdb->nodes[i]->pnn, ban_state->count,
2969 			ctdb->tunable.recovery_ban_period));
2970 		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2971 		ban_state->count = 0;
2974 	/* get relevant tunables */
2975 	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2977 		DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2981 	/* get the current recovery lock file from the server */
2982 	if (update_recovery_lock_file(ctdb) != 0) {
2983 		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2987 	/* Make sure that if recovery lock verification becomes disabled when
	   we close any held lock fd so the file can be released */
2990 	if (ctdb->tunable.verify_recovery_lock == 0) {
2991 		if (ctdb->recovery_lock_fd != -1) {
2992 			close(ctdb->recovery_lock_fd);
2993 			ctdb->recovery_lock_fd = -1;
2997 	pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2998 	if (pnn == (uint32_t)-1) {
2999 		DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
3003 	/* get the vnnmap */
3004 	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3006 		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3011 	/* get number of nodes */
	/* drop the previous iteration's cached nodemap before re-fetching */
3013 		talloc_free(rec->nodemap);
3014 		rec->nodemap = NULL;
3017 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3019 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3022 	nodemap = rec->nodemap;
3024 	/* check which node is the recovery master */
3025 	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3027 		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3031 	/* if we are not the recmaster we can safely ignore any ip reallocate requests */
3032 	if (rec->recmaster != pnn) {
3033 		if (rec->ip_reallocate_ctx != NULL) {
3034 			talloc_free(rec->ip_reallocate_ctx);
3035 			rec->ip_reallocate_ctx = NULL;
3036 			rec->reallocate_callers = NULL;
3039 	/* if there are takeovers requested, perform it and notify the waiters */
3040 	if (rec->reallocate_callers) {
3041 		process_ipreallocate_requests(ctdb, rec);
	/* no recmaster known yet: trigger the initial election */
3044 	if (rec->recmaster == (uint32_t)-1) {
3045 		DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3046 		force_election(rec, pnn, nodemap);
3051 	/* if the local daemon is STOPPED, we verify that the databases are
3052 	   also frozen and thet the recmode is set to active 
3054 	if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
3055 		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3057 			DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3059 		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3060 			DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
3062 			ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3064 				DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
3067 			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3069 				DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
3076 	/* If the local node is stopped, verify we are not the recmaster 
3077 	   and yield this role if so
3079 	if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
3080 		DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
3081 		force_election(rec, pnn, nodemap);
3085 	/* check that we (recovery daemon) and the local ctdb daemon
3086 	   agrees on whether we are banned or not
3090 	/* remember our own node flags */
3091 	rec->node_flags = nodemap->nodes[pnn].flags;
3093 	/* count how many active nodes there are */
3094 	rec->num_active    = 0;
3095 	rec->num_connected = 0;
3096 	for (i=0; i<nodemap->num; i++) {
3097 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3100 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3101 			rec->num_connected++;
3106 	/* verify that the recmaster node is still active */
3107 	for (j=0; j<nodemap->num; j++) {
3108 		if (nodemap->nodes[j].pnn==rec->recmaster) {
	/* j == num means the recmaster pnn was not found in the nodemap */
3113 	if (j == nodemap->num) {
3114 		DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3115 		force_election(rec, pnn, nodemap);
3119 	/* if recovery master is disconnected we must elect a new recmaster */
3120 	if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3121 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3122 		force_election(rec, pnn, nodemap);
3126 	/* grap the nodemap from the recovery master to check if it is banned */
3127 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3128 				   mem_ctx, &recmaster_nodemap);
3130 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n", 
3131 			  nodemap->nodes[j].pnn));
	/* the recmaster considers itself inactive: force a new election */
3136 	if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3137 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3138 		force_election(rec, pnn, nodemap);
3143 	/* verify that we have all ip addresses we should have and we dont
3144 	 * have addresses we shouldnt have.
3146 	if (ctdb->tunable.disable_ip_failover == 0) {
		/* ip_check_disable_ctx != NULL means checks are temporarily off */
3147 		if (rec->ip_check_disable_ctx == NULL) {
3148 			if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3149 				DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3155 	/* if we are not the recmaster then we do not need to check
3156 	   if recovery is needed
3158 	if (pnn != rec->recmaster) {
	/* ---- everything below runs only on the recmaster ---- */
3163 	/* ensure our local copies of flags are right */
3164 	ret = update_local_flags(rec, nodemap);
3165 	if (ret == MONITOR_ELECTION_NEEDED) {
3166 		DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3167 		force_election(rec, pnn, nodemap);
3170 	if (ret != MONITOR_OK) {
3171 		DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3175 	if (ctdb->num_nodes != nodemap->num) {
3176 		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3177 		reload_nodes_file(ctdb);
3181 	/* verify that all active nodes agree that we are the recmaster */
3182 	switch (verify_recmaster(rec, nodemap, pnn)) {
3183 	case MONITOR_RECOVERY_NEEDED:
3184 		/* can not happen */
3186 	case MONITOR_ELECTION_NEEDED:
3187 		force_election(rec, pnn, nodemap);
3191 	case MONITOR_FAILED:
3196 	if (rec->need_recovery) {
3197 		/* a previous recovery didn't finish */
3198 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3202 	/* verify that all active nodes are in normal mode 
3203 	   and not in recovery mode 
3205 	switch (verify_recmode(ctdb, nodemap)) {
3206 	case MONITOR_RECOVERY_NEEDED:
3207 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3209 	case MONITOR_FAILED:
3211 	case MONITOR_ELECTION_NEEDED:
3212 		/* can not happen */
3218 	if (ctdb->tunable.verify_recovery_lock != 0) {
3219 		/* we should have the reclock - check its not stale */
3220 		ret = check_recovery_lock(ctdb);
3222 			DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3223 			ctdb_set_culprit(rec, ctdb->pnn);
3224 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3229 	/* get the nodemap for all active remote nodes
3231 	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3232 	if (remote_nodemaps == NULL) {
3233 		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3236 	for(i=0; i<nodemap->num; i++) {
3237 		remote_nodemaps[i] = NULL;
3239 	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3240 		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3244 	/* verify that all other nodes have the same nodemap as we have
3246 	for (j=0; j<nodemap->num; j++) {
3247 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3251 		if (remote_nodemaps[j] == NULL) {
3252 			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3253 			ctdb_set_culprit(rec, j);
3258 		/* if the nodes disagree on how many nodes there are
3259 		   then this is a good reason to try recovery
3261 		if (remote_nodemaps[j]->num != nodemap->num) {
3262 			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3263 				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3264 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3265 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3269 		/* if the nodes disagree on which nodes exist and are
3270 		   active, then that is also a good reason to do recovery
3272 		for (i=0;i<nodemap->num;i++) {
3273 			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3274 				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n", 
3275 					  nodemap->nodes[j].pnn, i, 
3276 					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3277 				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3278 				do_recovery(rec, mem_ctx, pnn, nodemap, 
3284 		/* verify the flags are consistent
3286 		for (i=0; i<nodemap->num; i++) {
3287 			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3291 			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3292 				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n", 
3293 				  nodemap->nodes[j].pnn, 
3294 				  nodemap->nodes[i].pnn, 
3295 				  remote_nodemaps[j]->nodes[i].flags,
3296 				  nodemap->nodes[j].flags));
				/* a node knows its own flags best: trust the
				   remote node's view of its own flags, our
				   view for everyone else */
3298 					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3299 					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3300 					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3301 					do_recovery(rec, mem_ctx, pnn, nodemap, 
3305 					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3306 					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3307 					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3308 					do_recovery(rec, mem_ctx, pnn, nodemap, 
3317 	/* there better be the same number of lmasters in the vnn map
3318 	   as there are active nodes or we will have to do a recovery
3320 	if (vnnmap->size != rec->num_active) {
3321 		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n", 
3322 			  vnnmap->size, rec->num_active));
3323 		ctdb_set_culprit(rec, ctdb->pnn);
3324 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3328 	/* verify that all active nodes in the nodemap also exist in 
	   the vnnmap (as lmasters) */
3331 	for (j=0; j<nodemap->num; j++) {
3332 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3335 		if (nodemap->nodes[j].pnn == pnn) {
3339 		for (i=0; i<vnnmap->size; i++) {
3340 			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3344 		if (i == vnnmap->size) {
3345 			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n", 
3346 				  nodemap->nodes[j].pnn));
3347 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3348 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3354 	/* verify that all other nodes have the same vnnmap
3355 	   and are from the same generation
3357 	for (j=0; j<nodemap->num; j++) {
3358 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3361 		if (nodemap->nodes[j].pnn == pnn) {
3365 		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
3366 					  mem_ctx, &remote_vnnmap);
3368 			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n", 
3369 				  nodemap->nodes[j].pnn));
3373 		/* verify the vnnmap generation is the same */
3374 		if (vnnmap->generation != remote_vnnmap->generation) {
3375 			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n", 
3376 				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3377 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3378 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3382 		/* verify the vnnmap size is the same */
3383 		if (vnnmap->size != remote_vnnmap->size) {
3384 			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n", 
3385 				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3386 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3387 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3391 		/* verify the vnnmap is the same */
3392 		for (i=0;i<vnnmap->size;i++) {
3393 			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3394 				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n", 
3395 					  nodemap->nodes[j].pnn));
3396 				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3397 				do_recovery(rec, mem_ctx, pnn, nodemap, 
3404 	/* we might need to change who has what IP assigned */
3405 	if (rec->need_takeover_run) {
3406 		uint32_t culprit = (uint32_t)-1;
		/* clear the flag first; error paths below re-set it */
3408 		rec->need_takeover_run = false;
3410 		/* update the list of public ips that a node can handle for
		   all connected nodes */
3413 		ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3415 			DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3417 			rec->need_takeover_run = true;
3421 		/* execute the "startrecovery" event script on all nodes */
3422 		ret = run_startrecovery_eventscript(rec, nodemap);
3424 			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3425 			ctdb_set_culprit(rec, ctdb->pnn);
3426 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3430 		/* If takeover run fails, then the offending nodes are
3431 		 * assigned ban culprit counts. And we re-try takeover.
3432 		 * If takeover run fails repeatedly, the node would get
3435 		 * If rec->need_takeover_run is not set to true at this
3436 		 * failure, monitoring is disabled cluster-wide (via
3437 		 * startrecovery eventscript) and will not get enabled.
3439 		ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, rec);
3441 			DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Trying again\n"));
3445 		/* execute the "recovered" event script on all nodes */
3446 		ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3448 		// we cant check whether the event completed successfully
3449 		// since this script WILL fail if the node is in recovery mode
3450 		// and if that race happens, the code here would just cause a second
3451 		// cascading recovery.
3453 			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3454 			ctdb_set_culprit(rec, ctdb->pnn);
3455 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3462   the main monitoring loop
/*
 * Entry point of the recovery daemon proper: allocate the long-lived
 * ctdb_recoverd state, register handlers for every SRVID message the
 * recovery daemon services, then run main_loop() forever, throttled to
 * roughly one pass per recover_interval seconds.  Never returns under
 * normal operation.
 * NOTE(review): interior lines are elided in this extract.
 */
3464 static void monitor_cluster(struct ctdb_context *ctdb)
3466 	struct ctdb_recoverd *rec;
3468 	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3470 	rec = talloc_zero(ctdb, struct ctdb_recoverd);
3471 	CTDB_NO_MEMORY_FATAL(ctdb, rec);
	/* priority_time seeds election priority: earlier start wins */
3475 	rec->priority_time = timeval_current();
3477 	/* register a message port for sending memory dumps */
3478 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3480 	/* register a message port for recovery elections */
3481 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3483 	/* when nodes are disabled/enabled */
3484 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3486 	/* when we are asked to puch out a flag change */
3487 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3489 	/* register a message port for vacuum fetch */
3490 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3492 	/* register a message port for reloadnodes  */
3493 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3495 	/* register a message port for performing a takeover run */
3496 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3498 	/* register a message port for disabling the ip check for a short while */
3499 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3501 	/* register a message port for updating the recovery daemons node assignment for an ip */
3502 	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
	/* per-iteration scratch context, freed after each main_loop pass */
3505 		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3506 		struct timeval start;
3510 			DEBUG(DEBUG_CRIT,(__location__
3511 					  " Failed to create temp context\n"));
3515 		start = timeval_current();
3516 		main_loop(ctdb, rec, mem_ctx);
3517 		talloc_free(mem_ctx);
3519 		/* we only check for recovery once every second */
3520 		elapsed = timeval_elapsed(&start);
3521 		if (elapsed < ctdb->tunable.recover_interval) {
3522 			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3
3529   event handler for when the main ctdbd dies
/* Fires when the pipe to the parent ctdbd closes (parent exited):
 * log and terminate the recovery daemon. */
3531 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde, 
3532 				 uint16_t flags, void *private_data)
3534 	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3539   called regularly to verify that the recovery daemon is still running
/*
 * Periodic (30s) watchdog in the main ctdbd: if the recovery daemon
 * process is gone, schedule an immediate restart via ctdb_restart_recd;
 * in either case re-arm itself for the next check.
 */
3541 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te, 
3542 			    struct timeval yt, void *p)
3544 	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
	/* kill(pid, 0) probes process existence without sending a signal */
3546 	if (kill(ctdb->recoverd_pid, 0) != 0) {
3547 		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
		/* restart on the next event-loop iteration */
3549 		event_add_timed(ctdb->ev, ctdb, timeval_zero(), 
3550 				ctdb_restart_recd, ctdb);
	/* re-arm this watchdog for 30 seconds from now */
3555 	event_add_timed(ctdb->ev, ctdb, 
3556 			timeval_current_ofs(30, 0),
3557 			ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap any exited children
 * with non-blocking waitpid so helper processes (e.g. the reclock
 * checker) do not become zombies.
 * NOTE(review): the surrounding reap loop is elided in this extract.
 */
3560 static void recd_sig_child_handler(struct event_context *ev,
3561 	struct signal_event *se, int signum, int count,
3565 //	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3570 		pid = waitpid(-1, &status, WNOHANG);
		/* ECHILD just means no children remain - not an error */
3572 			if (errno != ECHILD) {
3573 				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3578 			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3584   startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon.  The parent records the child's pid,
 * starts the 30-second watchdog (ctdb_check_recd) and returns 0.
 * The child switches to client mode, watches a pipe so it exits when
 * the parent dies, installs a SIGCHLD handler, and enters
 * monitor_cluster() - which should never return.
 * NOTE(review): interior lines (returns, error handling) are elided
 * in this extract.
 */
3586 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3589 	struct signal_event *se;
3590 	struct tevent_fd *fde;
	/* pipe used by the child to detect parent death (read end sees EOF) */
3592 	if (pipe(fd) != 0) {
3596 	ctdb->ctdbd_pid = getpid();
3598 	ctdb->recoverd_pid = fork();
3599 	if (ctdb->recoverd_pid == -1) {
	/* ---- parent: arm the watchdog and return ---- */
3603 	if (ctdb->recoverd_pid != 0) {
3605 		event_add_timed(ctdb->ev, ctdb, 
3606 				timeval_current_ofs(30, 0),
3607 				ctdb_check_recd, ctdb);
	/* ---- child: become the recovery daemon ---- */
	/* reseed so the child's random sequence differs from the parent's */
3613 	srandom(getpid() ^ time(NULL));
3615 	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
3616 		DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3620 	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
	/* exit when the parent closes its end of the pipe */
3622 	fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
3623 		     ctdb_recoverd_parent, &fd[0]);
3624 	tevent_fd_set_auto_close(fde);
3626 	/* set up a handler to pick up sigchld */
3627 	se = event_add_signal(ctdb->ev, ctdb,
3629 				     recd_sig_child_handler,
3632 		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3636 	monitor_cluster(ctdb);
	/* monitor_cluster() should loop forever; reaching here is a bug */
3638 	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3643   shutdown the recovery daemon
/* Ask the recovery daemon to terminate with SIGTERM; a no-op if it
 * was never started (recoverd_pid == 0). */
3645 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3647 	if (ctdb->recoverd_pid == 0) {
3651 	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3652 	kill(ctdb->recoverd_pid, SIGTERM);
3655 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
3656 struct timeval t, void *private_data)
3658 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3660 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3661 ctdb_stop_recoverd(ctdb);
3662 ctdb_start_recoverd(ctdb);