4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/tevent/tevent.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb_client.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
   NOTE(review): this file is a lossy extraction -- the closing comment
   marker and the struct terminator are missing here; verify against the
   complete upstream ctdb_recoverd.c.
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;	/* next pending caller in the list */
39 struct rd_memdump_reply *rd;	/* reply routing info for notifying the caller */
   /* per-node misbehaviour tracking used to decide when to ban a node.
      NOTE(review): truncated -- at least a counter field and the closing
      brace are missing from this extraction; confirm against full source. */
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;	/* last time this node was reported as a culprit */
48 private state of recovery daemon
   NOTE(review): truncated extraction -- some fields and the struct
   terminator are missing; verify against the complete file.
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;	/* back-pointer to the owning ctdb context */
54 uint32_t num_connected;	/* presumably count of connected nodes -- confirm */
55 uint32_t last_culprit_node;	/* pnn of the node last blamed for a recovery */
56 struct ctdb_node_map *nodemap;	/* cached cluster node map */
57 struct timeval priority_time;
58 bool need_takeover_run;	/* set when an IP takeover run must be scheduled */
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;	/* non-NULL while an election is in progress */
63 struct vacuum_info *vacuum_info;	/* list of in-flight vacuum fetch operations */
64 TALLOC_CTX *ip_reallocate_ctx;	/* talloc parent of reallocate_callers entries */
65 struct ip_reallocate_list *reallocate_callers;	/* "ctdb ipreallocate" callers to notify */
66 TALLOC_CTX *ip_check_disable_ctx;	/* non-NULL while IP-check is disabled */
67 struct ctdb_control_get_ifaces *ifaces;	/* cached interface list */
/* timeouts for remote controls / monitor loop; both expand against a
   variable named `ctdb` that must be in scope at the call site */
70 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
71 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
75 ban a node for a period of time
   NOTE(review): truncated extraction -- declarations, the early-return for
   a bad pnn, the bantime.pnn assignment and closing braces are missing;
   verify against the complete file.
77 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
80 struct ctdb_context *ctdb = rec->ctdb;
81 struct ctdb_ban_time bantime;
   /* log before acting so the ban attempt is visible even if it fails */
83 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
   /* refuse to ban an invalid node number */
85 if (!ctdb_validate_pnn(ctdb, pnn)) {
86 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
91 bantime.time = ban_time;
   /* push the ban to the target node via a SET_BAN control */
93 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
95 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* result codes returned by the cluster monitoring step */
enum monitor_result {
	MONITOR_OK,			/* cluster looks healthy */
	MONITOR_RECOVERY_NEEDED,	/* a recovery run is required */
	MONITOR_ELECTION_NEEDED,	/* a recmaster election is required */
	MONITOR_FAILED			/* the monitoring itself failed */
};
105 run the "recovered" eventscript on all nodes
   NOTE(review): truncated extraction -- variable declarations, some async
   control arguments, return statements and closing braces are missing.
107 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
112 tmp_ctx = talloc_new(ctdb);
113 CTDB_NO_MEMORY(ctdb, tmp_ctx);
   /* broadcast END_RECOVERY to every active node */
115 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
116 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
118 CONTROL_TIMEOUT(), false, tdb_null,
121 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
123 talloc_free(tmp_ctx);
127 talloc_free(tmp_ctx);
132 remember the trouble maker
   Adds `count` ban credits to `culprit`; credits decay if the node has
   behaved for longer than the recovery grace period.
   NOTE(review): truncated extraction -- the early return for an invalid
   culprit and closing braces are missing.
134 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
136 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
137 struct ctdb_banning_state *ban_state;
139 if (culprit > ctdb->num_nodes) {
140 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
   /* lazily allocate the per-node ban state, parented to the node */
144 if (ctdb->nodes[culprit]->ban_state == NULL) {
145 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
146 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
150 ban_state = ctdb->nodes[culprit]->ban_state;
151 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
152 /* this was the first time in a long while this node
153 misbehaved so we will forgive any old transgressions.
155 ban_state->count = 0;
158 ban_state->count += count;
159 ban_state->last_reported_time = timeval_current();
160 rec->last_culprit_node = culprit;
164 remember the trouble maker
166 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
168 ctdb_set_culprit_count(rec, culprit, 1);
172 /* this callback is called for every node that failed to execute the
   "startrecovery" event -- it blames that node for the failed recovery.
175 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
177 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
179 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
   /* one ban credit for the failing node */
181 ctdb_set_culprit(rec, node_pnn);
185 run the "startrecovery" eventscript on all nodes
   NOTE(review): truncated extraction -- declarations, some async control
   arguments, returns and closing braces are missing.
187 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
191 struct ctdb_context *ctdb = rec->ctdb;
193 tmp_ctx = talloc_new(ctdb);
194 CTDB_NO_MEMORY(ctdb, tmp_ctx);
   /* broadcast START_RECOVERY; nodes that fail get blamed via the callback */
196 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
197 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
199 CONTROL_TIMEOUT(), false, tdb_null,
201 startrecovery_fail_callback,
203 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
204 talloc_free(tmp_ctx);
208 talloc_free(tmp_ctx);
/* per-node callback for GET_CAPABILITIES: validate the reply payload and
   stash the capability bits on the matching node structure */
212 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
   /* reject malformed replies (wrong size or missing payload) */
214 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
215 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
218 if (node_pnn < ctdb->num_nodes) {
219 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
224 update the node capabilities for all connected nodes
   NOTE(review): truncated extraction -- declarations, some async control
   arguments, returns and closing braces are missing.
226 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
231 tmp_ctx = talloc_new(ctdb);
232 CTDB_NO_MEMORY(ctdb, tmp_ctx);
   /* query every active node; replies land in async_getcap_callback */
234 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
235 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
239 async_getcap_callback, NULL,
241 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
242 talloc_free(tmp_ctx);
246 talloc_free(tmp_ctx);
/* a node failed to freeze during recovery: blame it heavily (one credit
   per cluster node) so repeated failures lead to a ban quickly */
250 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
252 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
254 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
255 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* a node failed to start the recovery transaction: blame it heavily
   (one credit per cluster node), mirroring set_recmode_fail_callback */
258 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
260 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
262 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
263 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
267 change recovery mode on all nodes
   When entering CTDB_RECOVERY_ACTIVE the databases are frozen first
   (one FREEZE control per database priority level), then the new
   recmode is pushed to all active nodes.
   NOTE(review): truncated extraction -- declarations, several control
   arguments, returns and closing braces are missing.
269 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
275 tmp_ctx = talloc_new(ctdb);
276 CTDB_NO_MEMORY(ctdb, tmp_ctx);
278 /* freeze all nodes */
279 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
280 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
   /* priorities are 1-based, hence the <= bound */
283 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
284 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
289 set_recmode_fail_callback,
291 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
292 talloc_free(tmp_ctx);
   /* broadcast the new recovery mode value itself */
299 data.dsize = sizeof(uint32_t);
300 data.dptr = (unsigned char *)&rec_mode;
302 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
308 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
309 talloc_free(tmp_ctx);
313 talloc_free(tmp_ctx);
318 change recovery master on all node
   Pushes `pnn` as the new recmaster to every active node.
   NOTE(review): truncated extraction -- declarations, returns and closing
   braces are missing.
320 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
326 tmp_ctx = talloc_new(ctdb);
327 CTDB_NO_MEMORY(ctdb, tmp_ctx);
   /* payload is just the recmaster pnn */
329 data.dsize = sizeof(uint32_t);
330 data.dptr = (unsigned char *)&pnn;
332 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
333 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
335 CONTROL_TIMEOUT(), false, data,
338 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
339 talloc_free(tmp_ctx);
343 talloc_free(tmp_ctx);
347 /* update all remote nodes to use the same db priority that we have
348 this can fail if the remove node has not yet been upgraded to
349 support this function, so we always return success and never fail
350 a recovery if this call fails.
   NOTE(review): truncated extraction -- declarations, loop internals,
   returns and closing braces are missing.
352 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
353 struct ctdb_node_map *nodemap,
354 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
359 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
361 /* step through all local databases */
362 for (db=0; db<dbmap->num;db++) {
364 struct ctdb_db_priority db_prio;
   /* read the priority for this db from the local node ... */
367 db_prio.db_id = dbmap->dbs[db].dbid;
368 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
370 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
374 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
   /* ... and push it out to all active nodes */
376 data.dptr = (uint8_t *)&db_prio;
377 data.dsize = sizeof(db_prio);
379 if (ctdb_client_async_control(ctdb,
380 CTDB_CONTROL_SET_DB_PRIORITY,
382 CONTROL_TIMEOUT(), false, data,
385 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
393 ensure all other nodes have attached to any databases that we have
   For every remote active node, compares its dbmap against the local one
   and creates any database the remote is missing.
   NOTE(review): truncated extraction -- declarations, `continue`/`break`
   statements, error checks, returns and closing braces are missing.
395 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
396 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
399 struct ctdb_dbid_map *remote_dbmap;
401 /* verify that all other nodes have all our databases */
402 for (j=0; j<nodemap->num; j++) {
403 /* we dont need to ourself ourselves */
404 if (nodemap->nodes[j].pnn == pnn) {
407 /* dont check nodes that are unavailable */
408 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
412 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
413 mem_ctx, &remote_dbmap);
415 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
419 /* step through all local databases */
420 for (db=0; db<dbmap->num;db++) {
   /* linear scan of the remote dbmap for a matching dbid */
424 for (i=0;i<remote_dbmap->num;i++) {
425 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
429 /* the remote node already have this database */
430 if (i!=remote_dbmap->num) {
433 /* ok so we need to create this database */
434 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
437 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
440 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
441 mem_ctx, name, dbmap->dbs[db].persistent);
443 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
454 ensure we are attached to any databases that anyone else is attached to
   Mirror of create_missing_remote_databases: scans each remote node's
   dbmap and attaches locally to anything we are missing, then re-reads
   the local dbmap into *dbmap.
   NOTE(review): truncated extraction -- declarations, `continue`/`break`
   statements, error checks, returns and closing braces are missing.
456 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
457 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
460 struct ctdb_dbid_map *remote_dbmap;
462 /* verify that we have all database any other node has */
463 for (j=0; j<nodemap->num; j++) {
464 /* we dont need to ourself ourselves */
465 if (nodemap->nodes[j].pnn == pnn) {
468 /* dont check nodes that are unavailable */
469 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
473 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
474 mem_ctx, &remote_dbmap);
476 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
480 /* step through all databases on the remote node */
481 for (db=0; db<remote_dbmap->num;db++) {
484 for (i=0;i<(*dbmap)->num;i++) {
485 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
489 /* we already have this db locally */
490 if (i!=(*dbmap)->num) {
493 /* ok so we need to create this database and
496 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
497 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
499 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
500 nodemap->nodes[j].pnn));
503 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
504 remote_dbmap->dbs[db].persistent);
506 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
   /* refresh the caller's view of the local dbmap after attaching */
509 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
511 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
522 pull the remote database contents from one node into the recdb
   Records are merged by rsn (record sequence number): a pulled record
   only replaces an existing recdb record when it is strictly newer, or
   same-rsn but the existing record's dmaster is not the recovery master.
   NOTE(review): truncated extraction -- declarations, loop header pieces,
   returns and closing braces are missing; per-record bounds checks may
   also be missing. Verify against the complete file.
524 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
525 struct tdb_wrap *recdb, uint32_t dbid,
530 struct ctdb_marshall_buffer *reply;
531 struct ctdb_rec_data *rec;
533 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
535 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
536 CONTROL_TIMEOUT(), &outdata);
538 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
539 talloc_free(tmp_ctx);
543 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
   /* sanity-check the marshalled reply before walking it */
545 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
546 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
547 talloc_free(tmp_ctx);
551 rec = (struct ctdb_rec_data *)&reply->data[0];
   /* advance to the next marshalled record: records are packed
      back-to-back, each rec->length bytes long */
555 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
557 struct ctdb_ltdb_header *hdr;
560 key.dptr = &rec->data[0];
561 key.dsize = rec->keylen;
562 data.dptr = &rec->data[key.dsize];
563 data.dsize = rec->datalen;
565 hdr = (struct ctdb_ltdb_header *)data.dptr;
567 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
568 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
569 talloc_free(tmp_ctx);
573 /* fetch the existing record, if any */
574 existing = tdb_fetch(recdb->tdb, key);
576 if (existing.dptr != NULL) {
577 struct ctdb_ltdb_header header;
578 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
579 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
580 (unsigned)existing.dsize, srcnode));
582 talloc_free(tmp_ctx);
585 header = *(struct ctdb_ltdb_header *)existing.dptr;
   /* keep the existing record unless the pulled one wins the merge */
587 if (!(header.rsn < hdr->rsn ||
588 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
593 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
594 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
595 talloc_free(tmp_ctx);
600 talloc_free(tmp_ctx);
606 pull all the remote database contents into the recdb
   NOTE(review): truncated extraction -- parameter list tail, `continue`
   statements, returns and closing braces are missing.
608 static int pull_remote_database(struct ctdb_context *ctdb,
609 struct ctdb_recoverd *rec,
610 struct ctdb_node_map *nodemap,
611 struct tdb_wrap *recdb, uint32_t dbid,
616 /* pull all records from all other nodes across onto this node
617 (this merges based on rsn)
619 for (j=0; j<nodemap->num; j++) {
620 /* dont merge from nodes that are unavailable */
621 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
624 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
625 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
626 nodemap->nodes[j].pnn));
   /* blame the node that failed the pull, heavily */
627 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
637 update flags on all active nodes
   Thin wrapper around the MODIFY_FLAGS control; sets `flags` and clears
   everything else (~flags).
639 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
643 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
645 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
653 ensure all nodes have the same vnnmap we do
   NOTE(review): truncated extraction -- `continue`, error returns and
   closing braces are missing.
655 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
656 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
660 /* push the new vnn map out to all the nodes */
661 for (j=0; j<nodemap->num; j++) {
662 /* dont push to nodes that are unavailable */
663 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
667 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
669 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
   /* tracks one in-flight vacuum-fetch operation; linked into
      rec->vacuum_info. NOTE(review): truncated -- the struct tag line,
      some fields (e.g. srcnode) and the terminator are missing. */
679 struct vacuum_info *next, *prev;	/* doubly-linked list pointers (DLIST) */
680 struct ctdb_recoverd *rec;	/* owning recovery daemon state */
682 struct ctdb_db_context *ctdb_db;	/* database the records belong to */
683 struct ctdb_marshall_buffer *recs;	/* copied record blob being processed */
684 struct ctdb_rec_data *r;	/* cursor into recs */
687 static void vacuum_fetch_next(struct vacuum_info *v);
690 called when a vacuum fetch has completed - just free it and do the next one
692 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
694 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
   /* continue with the next queued record */
696 vacuum_fetch_next(v);
701 process the next element from the vacuum list
   Walks the copied record blob, skipping records it cannot lock, cannot
   read, or that are already locally mastered, and issues an async
   NULL_FUNC migration call for the first record that needs fetching.
   NOTE(review): truncated extraction -- `continue` statements, the
   count decrement/loop tail, returns and closing braces are missing.
703 static void vacuum_fetch_next(struct vacuum_info *v)
705 struct ctdb_call call;
706 struct ctdb_rec_data *r;
708 while (v->recs->count) {
709 struct ctdb_client_call_state *state;
711 struct ctdb_ltdb_header *hdr;
714 call.call_id = CTDB_NULL_FUNC;
715 call.flags = CTDB_IMMEDIATE_MIGRATION;
   /* advance the cursor past the current packed record */
718 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
721 call.key.dptr = &r->data[0];
722 call.key.dsize = r->keylen;
724 /* ensure we don't block this daemon - just skip a record if we can't get
726 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
730 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
731 if (data.dptr == NULL) {
732 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
736 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
738 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
742 hdr = (struct ctdb_ltdb_header *)data.dptr;
743 if (hdr->dmaster == v->rec->ctdb->pnn) {
744 /* its already local */
746 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
   /* fire the migration call, then release the chain lock */
752 state = ctdb_call_send(v->ctdb_db, &call);
753 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
755 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
   /* resume from vacuum_fetch_callback when the call completes */
759 state->async.fn = vacuum_fetch_callback;
760 state->async.private_data = v;
769 destroy a vacuum info structure
   talloc destructor: unlink from the owning recoverd's list on free.
771 static int vacuum_info_destructor(struct vacuum_info *v)
773 DLIST_REMOVE(v->rec->vacuum_info, v);
779 handler for vacuum fetch
   Message handler: receives a marshalled blob of records from a remote
   node, looks up/attaches the target database, copies the blob into a
   new vacuum_info and kicks off asynchronous processing.
   NOTE(review): truncated extraction -- declarations, `break`/`return`
   statements and closing braces are missing throughout.
781 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
782 TDB_DATA data, void *private_data)
784 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
785 struct ctdb_marshall_buffer *recs;
787 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
789 struct ctdb_dbid_map *dbmap=NULL;
790 bool persistent = false;
791 struct ctdb_db_context *ctdb_db;
792 struct ctdb_rec_data *r;
794 struct vacuum_info *v;
796 recs = (struct ctdb_marshall_buffer *)data.dptr;
797 r = (struct ctdb_rec_data *)&recs->data[0];
   /* nothing to do for an empty blob */
799 if (recs->count == 0) {
800 talloc_free(tmp_ctx);
   /* avoid concurrent processing of the same (srcnode, db) pair */
806 for (v=rec->vacuum_info;v;v=v->next) {
807 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
808 /* we're already working on records from this node */
809 talloc_free(tmp_ctx);
814 /* work out if the database is persistent */
815 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
817 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
818 talloc_free(tmp_ctx);
822 for (i=0;i<dbmap->num;i++) {
823 if (dbmap->dbs[i].dbid == recs->db_id) {
824 persistent = dbmap->dbs[i].persistent;
828 if (i == dbmap->num) {
829 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
830 talloc_free(tmp_ctx);
834 /* find the name of this database */
835 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
836 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
837 talloc_free(tmp_ctx);
842 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
843 if (ctdb_db == NULL) {
844 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
845 talloc_free(tmp_ctx);
   /* allocate the tracking structure, parented to rec */
849 v = talloc_zero(rec, struct vacuum_info);
851 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
852 talloc_free(tmp_ctx);
857 v->srcnode = srcnode;
858 v->ctdb_db = ctdb_db;
   /* take a private copy of the record blob -- data belongs to the caller */
859 v->recs = talloc_memdup(v, recs, data.dsize);
860 if (v->recs == NULL) {
861 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
863 talloc_free(tmp_ctx);
866 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
868 DLIST_ADD(rec->vacuum_info, v);
870 talloc_set_destructor(v, vacuum_info_destructor);
872 vacuum_fetch_next(v);
873 talloc_free(tmp_ctx);
878 called when ctdb_wait_timeout should finish
   Timer callback: flags completion through the uint32_t pointed to by p.
   NOTE(review): the line setting *timed_out is missing from this
   extraction; verify against the complete file.
880 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
881 struct timeval yt, void *p)
883 uint32_t *timed_out = (uint32_t *)p;
888 wait for a given number of seconds
   Blocks in the event loop (still servicing events) until the one-shot
   timer fires. NOTE(review): the while-loop header around
   event_loop_once is missing from this extraction.
890 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
892 uint32_t timed_out = 0;
   /* fractional part of secs converted to microseconds */
893 time_t usecs = (secs - (time_t)secs) * 1000000;
894 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
896 event_loop_once(ctdb->ev);
901 called when an election times out (ends)
   Clearing election_timeout is what ctdb_wait_election polls for.
903 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
904 struct timeval t, void *p)
906 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
907 rec->election_timeout = NULL;
910 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
915 wait for an election to finish. It finished election_timeout seconds after
916 the last election packet is received
   Spins the event loop until ctdb_election_timeout clears the timer.
918 static void ctdb_wait_election(struct ctdb_recoverd *rec)
920 struct ctdb_context *ctdb = rec->ctdb;
921 while (rec->election_timeout) {
922 event_loop_once(ctdb->ev);
927 Update our local flags from all remote connected nodes.
928 This is only run when we are or we belive we are the recovery master
   Returns a monitor_result code (MONITOR_FAILED on error).
   NOTE(review): truncated extraction -- `continue` statements, some error
   returns, the final success return and closing braces are missing.
930 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
933 struct ctdb_context *ctdb = rec->ctdb;
934 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
936 /* get the nodemap for all active remote nodes and verify
937 they are the same as for this node
939 for (j=0; j<nodemap->num; j++) {
940 struct ctdb_node_map *remote_nodemap=NULL;
   /* skip disconnected nodes and ourselves */
943 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
946 if (nodemap->nodes[j].pnn == ctdb->pnn) {
950 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
951 mem_ctx, &remote_nodemap);
953 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
954 nodemap->nodes[j].pnn));
955 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
956 talloc_free(mem_ctx);
957 return MONITOR_FAILED;
   /* the remote node's view of its own flags wins over ours */
959 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
960 /* We should tell our daemon about this so it
961 updates its flags or else we will log the same
962 message again in the next iteration of recovery.
963 Since we are the recovery master we can just as
964 well update the flags on all nodes.
966 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
968 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
972 /* Update our local copy of the flags in the recovery
975 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
976 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
977 nodemap->nodes[j].flags));
978 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
980 talloc_free(remote_nodemap);
982 talloc_free(mem_ctx);
987 /* Create a new random generation ip.
988 The generation id can not be the INVALID_GENERATION id
990 static uint32_t new_generation(void)
995 generation = random();
997 if (generation != INVALID_GENERATION) {
1007 create a temporary working database
   Opens a fresh, unlocked tdb under db_directory_state used to merge all
   remote database contents during recovery; returned wrapper is
   parented to mem_ctx. NOTE(review): truncated extraction -- name
   format arguments, NULL checks, the final return and closing braces
   are missing.
1009 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1012 struct tdb_wrap *recdb;
1015 /* open up the temporary recovery database */
1016 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1017 ctdb->db_directory_state,
   /* NOLOCK: only this process touches the scratch db; no locking needed */
1024 tdb_flags = TDB_NOLOCK;
1025 if (ctdb->valgrinding) {
1026 tdb_flags |= TDB_NOMMAP;
1028 tdb_flags |= TDB_DISALLOW_NESTING;
   /* O_EXCL: fail rather than reuse a stale recovery db */
1030 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1031 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1032 if (recdb == NULL) {
1033 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1043 a traverse function for pulling all relevent records from recdb
   NOTE(review): truncated -- the struct tag line and several fields
   (len, failed, persistent are referenced by traverse_recdb below)
   plus the terminator are missing from this extraction.
1046 struct ctdb_context *ctdb;	/* owning context */
1047 struct ctdb_marshall_buffer *recdata;	/* growing marshalled output blob */
/* tdb traverse callback: marshal one recdb record into params->recdata,
   rewriting dmaster to the local node for non-persistent databases.
   Sets params->failed on allocation failure.
   NOTE(review): truncated extraction -- early returns and closing braces
   are missing. */
1053 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1055 struct recdb_data *params = (struct recdb_data *)p;
1056 struct ctdb_rec_data *rec;
1057 struct ctdb_ltdb_header *hdr;
1059 /* skip empty records */
1060 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1064 /* update the dmaster field to point to us */
1065 hdr = (struct ctdb_ltdb_header *)data.dptr;
1066 if (!params->persistent) {
1067 hdr->dmaster = params->ctdb->pnn;
1070 /* add the record to the blob ready to send to the nodes */
1071 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1073 params->failed = true;
   /* grow the output buffer to hold the new record */
1076 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1077 if (params->recdata == NULL) {
1078 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1079 rec->length + params->len, params->recdata->count));
1080 params->failed = true;
1083 params->recdata->count++;
1084 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1085 params->len += rec->length;
1092 push the recdb database out to all nodes
   Marshals every record in recdb via traverse_recdb, then broadcasts the
   blob with a PUSH_DB control to all active nodes.
   NOTE(review): truncated extraction -- declarations, returns and closing
   braces are missing.
1094 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1096 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1098 struct recdb_data params;
1099 struct ctdb_marshall_buffer *recdata;
1101 TALLOC_CTX *tmp_ctx;
1104 tmp_ctx = talloc_new(ctdb);
1105 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1107 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1108 CTDB_NO_MEMORY(ctdb, recdata);
1110 recdata->db_id = dbid;
1113 params.recdata = recdata;
   /* start after the marshall buffer header */
1114 params.len = offsetof(struct ctdb_marshall_buffer, data);
1115 params.failed = false;
1116 params.persistent = persistent;
1118 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1119 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1120 talloc_free(params.recdata);
1121 talloc_free(tmp_ctx);
1125 if (params.failed) {
1126 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1127 talloc_free(params.recdata);
1128 talloc_free(tmp_ctx);
   /* traverse may have reallocated the buffer -- refresh the pointer */
1132 recdata = params.recdata;
1134 outdata.dptr = (void *)recdata;
1135 outdata.dsize = params.len;
1137 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1138 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1140 CONTROL_TIMEOUT(), false, outdata,
1143 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1144 talloc_free(recdata);
1145 talloc_free(tmp_ctx);
1149 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1150 dbid, recdata->count));
1152 talloc_free(recdata);
1153 talloc_free(tmp_ctx);
1160 go through a full recovery on one database
   Sequence: build scratch recdb, pull+merge records from all nodes, wipe
   the database cluster-wide inside the recovery transaction, then push
   the merged contents back out.
   NOTE(review): truncated extraction -- parameters (dbid, persistent),
   declarations, returns and closing braces are missing.
1162 static int recover_database(struct ctdb_recoverd *rec,
1163 TALLOC_CTX *mem_ctx,
1167 struct ctdb_node_map *nodemap,
1168 uint32_t transaction_id)
1170 struct tdb_wrap *recdb;
1172 struct ctdb_context *ctdb = rec->ctdb;
1174 struct ctdb_control_wipe_database w;
1177 recdb = create_recdb(ctdb, mem_ctx);
1178 if (recdb == NULL) {
1182 /* pull all remote databases onto the recdb */
1183 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1185 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1189 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1191 /* wipe all the remote databases. This is safe as we are in a transaction */
1193 w.transaction_id = transaction_id;
1195 data.dptr = (void *)&w;
1196 data.dsize = sizeof(w);
1198 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1199 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1201 CONTROL_TIMEOUT(), false, data,
1204 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1209 /* push out the correct database. This sets the dmaster and skips
1210 the empty records */
1211 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1217 /* all done with this database */
1224 reload the nodes file
   NOTE(review): truncated extraction -- at least one statement between
   the signature and this call appears to be missing (likely resetting
   ctdb->nodes before the reload); verify against the complete file.
1226 static void reload_nodes_file(struct ctdb_context *ctdb)
1229 ctdb_load_nodes_file(ctdb);
/* refresh the known/available public IP lists for every node in the
   nodemap, flagging a takeover run if any node's allocation looks
   inconsistent. On failure *culprit is set to the offending pnn.
   NOTE(review): truncated extraction -- `continue`, returns and closing
   braces are missing. */
1232 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1233 struct ctdb_recoverd *rec,
1234 struct ctdb_node_map *nodemap,
   /* sanity check: nodemap must describe exactly our configured nodes */
1240 if (ctdb->num_nodes != nodemap->num) {
1241 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1242 ctdb->num_nodes, nodemap->num));
1244 *culprit = ctdb->pnn;
1249 for (j=0; j<nodemap->num; j++) {
1250 /* release any existing data */
1251 if (ctdb->nodes[j]->known_public_ips) {
1252 talloc_free(ctdb->nodes[j]->known_public_ips);
1253 ctdb->nodes[j]->known_public_ips = NULL;
1255 if (ctdb->nodes[j]->available_public_ips) {
1256 talloc_free(ctdb->nodes[j]->available_public_ips);
1257 ctdb->nodes[j]->available_public_ips = NULL;
1260 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1264 /* grab a new shiny list of public ips from the node */
1265 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1267 ctdb->nodes[j]->pnn,
1270 &ctdb->nodes[j]->known_public_ips);
1272 DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1273 ctdb->nodes[j]->pnn));
1275 *culprit = ctdb->nodes[j]->pnn;
   /* only verify allocations when failover is enabled and checks
      are not temporarily disabled */
1280 if (ctdb->tunable.disable_ip_failover == 0) {
1281 if (rec->ip_check_disable_ctx == NULL) {
1282 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1283 DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1284 rec->need_takeover_run = true;
1289 /* grab a new shiny list of public ips from the node */
1290 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1292 ctdb->nodes[j]->pnn,
1294 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1295 &ctdb->nodes[j]->available_public_ips);
1297 DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1298 ctdb->nodes[j]->pnn));
1300 *culprit = ctdb->nodes[j]->pnn;
1309 /* when we start a recovery, make sure all nodes use the same reclock file
   Broadcasts our reclock path (including the trailing NUL) via the
   SET_RECLOCK_FILE control to all active nodes.
   NOTE(review): truncated extraction -- the no-reclock early path, some
   control arguments, returns and closing braces are missing.
1312 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1314 struct ctdb_context *ctdb = rec->ctdb;
1315 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1319 if (ctdb->recovery_lock_file == NULL) {
   /* +1 so the terminating NUL travels with the path */
1323 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1324 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1327 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1328 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1334 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1335 talloc_free(tmp_ctx);
1339 talloc_free(tmp_ctx);
1345  we are the recmaster, and recovery is needed - start a recovery run
/* Perform a full cluster recovery as the recovery master:
 *  1. ban nodes that have repeatedly caused recoveries,
 *  2. take the recovery lock (when verify_recovery_lock is enabled),
 *  3. create any missing local/remote databases and sync db priorities,
 *  4. set recovery mode ACTIVE, run the "startrecovery" event,
 *  5. transaction-recover every database under a new generation id,
 *  6. rebuild the vnnmap from active lmaster-capable nodes,
 *  7. reassign public IPs and run the "recovered" event,
 *  8. reset ban counts and suppress re-recovery for a timeout.
 * Returns 0 on success; error paths (mostly elided in this extract)
 * leave rec->need_recovery == true so recovery is retried.
 * NOTE(review): many lines are missing from this extract (error checks,
 * returns, closing braces) — verify control flow against the full source.
 */
1347 static int do_recovery(struct ctdb_recoverd *rec,
1348 TALLOC_CTX *mem_ctx, uint32_t pnn,
1349 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1351 struct ctdb_context *ctdb = rec->ctdb;
1353 uint32_t generation;
1354 struct ctdb_dbid_map *dbmap;
1357 struct timeval start_time;
1358 uint32_t culprit = (uint32_t)-1;
1360 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1362 /* if recovery fails, force it again */
1363 rec->need_recovery = true;
/* Ban any node whose recovery-culprit count has reached twice the
 * cluster size; the count is reset after banning. */
1365 for (i=0; i<ctdb->num_nodes; i++) {
1366 struct ctdb_banning_state *ban_state;
1368 if (ctdb->nodes[i]->ban_state == NULL) {
1371 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1372 if (ban_state->count < 2*ctdb->num_nodes) {
1375 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1376 ctdb->nodes[i]->pnn, ban_state->count,
1377 ctdb->tunable.recovery_ban_period));
1378 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1379 ban_state->count = 0;
/* Optionally serialise recoveries cluster-wide through the reclock file;
 * failure to take the lock marks ourselves culprit and aborts. */
1383 if (ctdb->tunable.verify_recovery_lock != 0) {
1384 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1385 start_time = timeval_current();
1386 if (!ctdb_recovery_lock(ctdb, true)) {
1387 ctdb_set_culprit(rec, pnn);
1388 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1391 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1392 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1395 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1397 /* get a list of all databases */
1398 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1400 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1404 /* we do the db creation before we set the recovery mode, so the freeze happens
1405 on all databases we will be dealing with. */
1407 /* verify that we have all the databases any other node has */
1408 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1410 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1414 /* verify that all other nodes have all our databases */
1415 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1417 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1420 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1422 /* update the database priority for all remote databases */
1423 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1425 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1427 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1430 /* update all other nodes to use the same setting for reclock files
1431 as the local recovery master.
1433 sync_recovery_lock_file_across_cluster(rec);
1435 /* set recovery mode to active on all nodes */
1436 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1438 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1442 /* execute the "startrecovery" event script on all nodes */
1443 ret = run_startrecovery_eventscript(rec, nodemap);
1445 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1450 update all nodes to have the same flags that we have
1452 for (i=0;i<nodemap->num;i++) {
1453 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1457 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1459 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1464 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1466 /* pick a new generation number */
1467 generation = new_generation();
1469 /* change the vnnmap on this node to use the new generation
1470 number but not on any other nodes.
1471 this guarantees that if we abort the recovery prematurely
1472 for some reason (a node stops responding?)
1473 that we can just return immediately and we will reenter
1474 recovery shortly again.
1475 I.e. we deliberately leave the cluster with an inconsistent
1476 generation id to allow us to abort recovery at any stage and
1477 just restart it from scratch.
1479 vnnmap->generation = generation;
1480 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1482 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* Broadcast TRANSACTION_START carrying the new generation; on failure
 * the started transactions are cancelled before giving up. */
1486 data.dptr = (void *)&generation;
1487 data.dsize = sizeof(uint32_t);
1489 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1490 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1492 CONTROL_TIMEOUT(), false, data,
1494 transaction_start_fail_callback,
1496 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1497 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1499 CONTROL_TIMEOUT(), false, tdb_null,
1503 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1508 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* Pull/push every database under the new generation. */
1510 for (i=0;i<dbmap->num;i++) {
1511 ret = recover_database(rec, mem_ctx,
1513 dbmap->dbs[i].persistent,
1514 pnn, nodemap, generation);
1516 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1521 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1523 /* commit all the changes */
1524 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1526 CONTROL_TIMEOUT(), false, data,
1529 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1533 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1536 /* update the capabilities for all nodes */
1537 ret = update_capabilities(ctdb, nodemap);
1539 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1543 /* build a new vnn map with all the currently active and
/* Rebuild the vnnmap from scratch: only active nodes with the
 * LMASTER capability are included. */
1545 generation = new_generation();
1546 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1547 CTDB_NO_MEMORY(ctdb, vnnmap);
1548 vnnmap->generation = generation;
1550 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1551 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1552 for (i=j=0;i<nodemap->num;i++) {
1553 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1556 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1557 /* this node can not be an lmaster */
1558 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1563 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1564 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1565 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* Degenerate case: no lmaster-capable node found — fall back to a
 * one-entry map containing just this node (the recmaster). */
1568 if (vnnmap->size == 0) {
1569 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1571 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1572 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1573 vnnmap->map[0] = pnn;
1576 /* update to the new vnnmap on all nodes */
1577 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1579 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1583 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1585 /* update recmaster to point to us for all nodes */
1586 ret = set_recovery_master(ctdb, nodemap, pnn);
1588 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1592 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1595 update all nodes to have the same flags that we have
1597 for (i=0;i<nodemap->num;i++) {
1598 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1602 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1604 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1609 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1611 /* disable recovery mode */
1612 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1614 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1618 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1621 tell nodes to takeover their public IPs
1623 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1625 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1627 rec->need_takeover_run = true;
/* Clear the flag before the run; set it again if the run fails so the
 * monitor loop retries the takeover later. */
1630 rec->need_takeover_run = false;
1631 ret = ctdb_takeover_run(ctdb, nodemap);
1633 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1634 rec->need_takeover_run = true;
1636 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1638 /* execute the "recovered" event script on all nodes */
1639 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1641 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1645 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1647 /* send a message to all clients telling them that the cluster
1648 has been reconfigured */
1649 ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1651 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1653 rec->need_recovery = false;
1655 /* we managed to complete a full recovery, make sure to forgive
1656 any past sins by the nodes that could now participate in the
1659 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1660 for (i=0;i<nodemap->num;i++) {
1661 struct ctdb_banning_state *ban_state;
1663 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1667 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1668 if (ban_state == NULL) {
1672 ban_state->count = 0;
1676 /* We just finished a recovery successfully.
1677 We now wait for rerecovery_timeout before we allow
1678 another recovery to take place.
1680 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1681 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1682 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
1689  elections are won by first checking the number of connected nodes, then
1690  the priority time, then the pnn
/* Payload broadcast during a recmaster election; fields are compared
 * in the order documented above to decide the winner. */
1692 struct election_message {
/* number of nodes the sender sees as connected (higher wins) */
1693 uint32_t num_connected;
/* when the sender's recovery daemon started (older wins) */
1694 struct timeval priority_time;
/* sender's node flags (banned/stopped nodes lose automatically);
 * NOTE(review): the sender pnn field is elided from this extract */
1696 uint32_t node_flags;
1700  form this nodes election data
/* Fill *em with this node's election credentials: pnn, daemon start
 * time, node flags and the count of connected nodes. If this node
 * lacks the RECMASTER capability its credentials are zeroed/reset so
 * it cannot win. Also refreshes rec->node_flags from the nodemap. */
1702 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1705 struct ctdb_node_map *nodemap;
1706 struct ctdb_context *ctdb = rec->ctdb;
1710 em->pnn = rec->ctdb->pnn;
1711 em->priority_time = rec->priority_time;
1713 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1715 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1719 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1720 em->node_flags = rec->node_flags;
/* Count every node that is not disconnected. */
1722 for (i=0;i<nodemap->num;i++) {
1723 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1724 em->num_connected++;
1728 /* we shouldnt try to win this election if we cant be a recmaster */
1729 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1730 em->num_connected = 0;
1731 em->priority_time = timeval_current();
1734 talloc_free(nodemap);
1738  see if the given election data wins
/* Decide whether WE beat the election message *em from another node.
 * Disqualifiers first (capability, banned, stopped — for either side),
 * then tie-break: most connected nodes, then oldest priority_time,
 * then pnn. Returns true when this node should claim recmaster. */
1740 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1742 struct election_message myem;
1745 ctdb_election_data(rec, &myem);
1747 /* we cant win if we dont have the recmaster capability */
1748 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1752 /* we cant win if we are banned */
1753 if (rec->node_flags & NODE_FLAGS_BANNED) {
1757 /* we cant win if we are stopped */
1758 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1762 /* we will automatically win if the other node is banned */
1763 if (em->node_flags & NODE_FLAGS_BANNED) {
1767 /* we will automatically win if the other node is stopped */
1768 if (em->node_flags & NODE_FLAGS_STOPPED) {
1772 /* try to use the most connected node */
1774 cmp = (int)myem.num_connected - (int)em->num_connected;
1777 /* then the longest running node */
1779 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break: numerically compare pnns */
1783 cmp = (int)myem.pnn - (int)em->pnn;
1790  send out an election request
/* Broadcast our election credentials to all nodes on the RECOVERY
 * srvid. When update_recmaster is true we also optimistically set
 * the local node's recmaster to ourselves (assuming we will win).
 * Returns 0 on success; nonzero when setting the recmaster fails. */
1792 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1795 TDB_DATA election_data;
1796 struct election_message emsg;
1798 struct ctdb_context *ctdb = rec->ctdb;
1800 srvid = CTDB_SRVID_RECOVERY;
1802 ctdb_election_data(rec, &emsg);
1804 election_data.dsize = sizeof(struct election_message);
1805 election_data.dptr = (unsigned char *)&emsg;
1808 /* send an election message to all active nodes */
1809 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1810 ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1813 /* A new node that is already frozen has entered the cluster.
1814 The existing nodes are not frozen and dont need to be frozen
1815 until the election has ended and we start the actual recovery
1817 if (update_recmaster == true) {
1818 /* first we assume we will win the election and set
1819 recoverymaster to be ourself on the current node
1821 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1823 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1833  this function will unban all nodes in the cluster
/* Clear the BANNED flag on every connected node that currently has it.
 * Errors from individual modflags controls are ignored (best effort). */
1835 static void unban_all_nodes(struct ctdb_context *ctdb)
1838 struct ctdb_node_map *nodemap;
1839 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1841 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1843 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1847 for (i=0;i<nodemap->num;i++) {
1848 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1849 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
/* modflags(set=0, clear=BANNED) removes the ban on that node */
1850 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1854 talloc_free(tmp_ctx);
1859  we think we are winning the election - send a broadcast election request
/* Timed-event callback: rebroadcast our election request (without
 * touching the recmaster setting) and clear the one-shot timer so a
 * later election can schedule it again. */
1861 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1863 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1866 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1868 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1871 talloc_free(rec->send_election_te);
1872 rec->send_election_te = NULL;
1876  handler for memory dumps
/* Message handler: a client asked the recovery master for a talloc
 * memory dump. Validate the reply-address payload, collect the dump
 * with ctdb_dump_memory(), and send it back to the requester's
 * pnn/srvid. All temporaries hang off tmp_ctx and are freed on every
 * path. */
1878 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1879 TDB_DATA data, void *private_data)
1881 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1884 struct rd_memdump_reply *rd;
/* payload must be exactly the reply-address struct */
1886 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1887 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1888 talloc_free(tmp_ctx);
1891 rd = (struct rd_memdump_reply *)data.dptr;
1893 dump = talloc_zero(tmp_ctx, TDB_DATA);
1895 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1896 talloc_free(tmp_ctx);
1899 ret = ctdb_dump_memory(ctdb, dump);
1901 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1902 talloc_free(tmp_ctx);
1906 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1908 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1910 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1911 talloc_free(tmp_ctx);
1915 talloc_free(tmp_ctx);
1919  handler for reload_nodes
/* Message handler: re-read the nodes file on request. The payload is
 * unused; this simply delegates to reload_nodes_file(). */
1921 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1922 TDB_DATA data, void *private_data)
1924 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1926 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1928 reload_nodes_file(rec->ctdb);
/* Timed-event callback: the ip-check disable period has expired.
 * Freeing ip_check_disable_ctx re-enables the periodic IP checks
 * (code elsewhere treats a NULL ctx as "checks enabled"). */
1932 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1933 struct timeval yt, void *p)
1935 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1937 talloc_free(rec->ip_check_disable_ctx);
1938 rec->ip_check_disable_ctx = NULL;
/* Message handler: a node reports a public IP assignment change.
 * Only the current recmaster records it (into the ip assignment
 * tree); other nodes ignore the message. Payload must be exactly a
 * struct ctdb_public_ip. */
1942 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
1943 TDB_DATA data, void *private_data)
1945 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1946 struct ctdb_public_ip *ip;
1948 if (rec->recmaster != rec->ctdb->pnn) {
1949 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
1953 if (data.dsize != sizeof(struct ctdb_public_ip)) {
1954 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
1958 ip = (struct ctdb_public_ip *)data.dptr;
1960 update_ip_assignment_tree(rec->ctdb, ip);
/* Message handler: disable the periodic public-ip verification for a
 * caller-supplied number of seconds (uint32_t payload). Any previous
 * disable window is cancelled first; a timed event on the new
 * ip_check_disable_ctx re-enables the checks when the window ends.
 * NOTE(review): the log strings below contain typos ("expexting",
 * "recaived") — left untouched since they are runtime output. */
1964 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1965 TDB_DATA data, void *private_data)
1967 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* cancel any currently-running disable window */
1970 if (rec->ip_check_disable_ctx != NULL) {
1971 talloc_free(rec->ip_check_disable_ctx);
1972 rec->ip_check_disable_ctx = NULL;
1975 if (data.dsize != sizeof(uint32_t)) {
1976 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1977 "expexting %lu\n", (long unsigned)data.dsize,
1978 (long unsigned)sizeof(uint32_t)));
1981 if (data.dptr == NULL) {
1982 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1986 timeout = *((uint32_t *)data.dptr);
1987 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1989 rec->ip_check_disable_ctx = talloc_new(rec);
1990 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
/* timer is parented to the ctx so freeing the ctx cancels it */
1992 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1997  handler for ip reallocate, just add it to the list of callers and
1998  handle this later in the monitor_cluster loop so we do not recurse
1999  with other callers to takeover_run()
/* Queue an "ctdb ipreallocate" request: validate the reply address,
 * lazily create the ip_reallocate_ctx, and push the caller onto the
 * reallocate_callers list (LIFO). The actual takeover run and replies
 * happen later in process_ipreallocate_requests(). */
2001 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2002 TDB_DATA data, void *private_data)
2004 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2005 struct ip_reallocate_list *caller;
2007 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2008 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2012 if (rec->ip_reallocate_ctx == NULL) {
2013 rec->ip_reallocate_ctx = talloc_new(rec);
2014 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2017 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2018 CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* steal the message payload so the reply address outlives the handler */
2020 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2021 caller->next = rec->reallocate_callers;
2022 rec->reallocate_callers = caller;
/* Service all queued ip-reallocate requests in one shot: refresh the
 * remote public-ip knowledge, run a takeover, then send the (int32)
 * result back to every queued caller that asked for a reply
 * (srvid != 0). Finally the whole caller list is freed by destroying
 * ip_reallocate_ctx. Failures set rec->need_takeover_run so the
 * monitor loop retries. */
2027 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2029 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2032 struct ip_reallocate_list *callers;
2035 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2037 /* update the list of public ips that a node can handle for
2040 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2042 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2044 rec->need_takeover_run = true;
2047 ret = ctdb_takeover_run(ctdb, rec->nodemap);
2049 DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
2050 rec->need_takeover_run = true;
/* reply payload is the takeover result code */
2054 result.dsize = sizeof(int32_t);
2055 result.dptr = (uint8_t *)&ret;
2057 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2059 /* Someone that sent srvid==0 does not want a reply */
2060 if (callers->rd->srvid == 0) {
2063 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2064 "%u:%llu\n", (unsigned)callers->rd->pnn,
2065 (unsigned long long)callers->rd->srvid));
2066 ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2068 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2069 "message to %u:%llu\n",
2070 (unsigned)callers->rd->pnn,
2071 (unsigned long long)callers->rd->srvid));
2075 talloc_free(tmp_ctx);
/* freeing the ctx releases every queued caller in one go */
2076 talloc_free(rec->ip_reallocate_ctx);
2077 rec->ip_reallocate_ctx = NULL;
2078 rec->reallocate_callers = NULL;
2084  handler for recovery master elections
/* Message handler for incoming election packets. Restarts the election
 * timeout, then compares the sender's credentials against ours:
 *  - if we win, schedule a (rate-limited) rebroadcast of our own
 *    election request and return;
 *  - if we lose, cancel any pending rebroadcast, release the recovery
 *    lock if we hold it (and unban everyone, since the ban list was
 *    ours), and record the sender as recmaster locally. */
2086 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2087 TDB_DATA data, void *private_data)
2089 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2091 struct election_message *em = (struct election_message *)data.dptr;
2092 TALLOC_CTX *mem_ctx;
2094 /* we got an election packet - update the timeout for the election */
2095 talloc_free(rec->election_timeout);
2096 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2098 timeval_current_ofs(0, 500000) :
2099 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2100 ctdb_election_timeout, rec);
2102 mem_ctx = talloc_new(ctdb);
2104 /* someone called an election. check their election data
2105 and if we disagree and we would rather be the elected node,
2106 send a new election message to all other nodes
2108 if (ctdb_election_win(rec, em)) {
/* rate-limit our counter-broadcast to one pending timer */
2109 if (!rec->send_election_te) {
2110 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2111 timeval_current_ofs(0, 500000),
2112 election_send_request, rec);
2114 talloc_free(mem_ctx);
2115 /*unban_all_nodes(ctdb);*/
/* We lost: drop any pending rebroadcast of our own candidacy. */
2120 talloc_free(rec->send_election_te);
2121 rec->send_election_te = NULL;
2123 if (ctdb->tunable.verify_recovery_lock != 0) {
2124 /* release the recmaster lock */
2125 if (em->pnn != ctdb->pnn &&
2126 ctdb->recovery_lock_fd != -1) {
2127 close(ctdb->recovery_lock_fd);
2128 ctdb->recovery_lock_fd = -1;
2129 unban_all_nodes(ctdb);
2133 /* ok, let that guy become recmaster then */
2134 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2136 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2137 talloc_free(mem_ctx);
2141 talloc_free(mem_ctx);
2147  force the start of the election process
/* Start a recmaster election: freeze the cluster by switching all nodes
 * to recovery mode ACTIVE, arm the election timeout, broadcast our
 * election request (optimistically claiming recmaster), then block in
 * ctdb_wait_election() while responses arrive. */
2149 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2150 struct ctdb_node_map *nodemap)
2153 struct ctdb_context *ctdb = rec->ctdb;
2155 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2157 /* set all nodes to recovery mode to stop all internode traffic */
2158 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2160 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2164 talloc_free(rec->election_timeout);
2165 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2167 timeval_current_ofs(0, 500000) :
2168 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2169 ctdb_election_timeout, rec);
/* true => also set recmaster to ourselves on the local node */
2171 ret = send_election_request(rec, pnn, true);
2173 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2177 /* wait for a few seconds to collect all responses */
2178 ctdb_wait_election(rec);
2184  handler for when a node changes its flags
/* Message handler: a node's flags changed. Re-reads the local nodemap,
 * locates the node, logs the change, and — when this node is recmaster
 * and the cluster is in NORMAL mode — requests a takeover run if the
 * DISABLED (perm-disabled/unhealthy) bits changed, since those cause
 * IP failover without triggering a full recovery. */
2186 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2187 TDB_DATA data, void *private_data)
2190 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2191 struct ctdb_node_map *nodemap=NULL;
2192 TALLOC_CTX *tmp_ctx;
2193 uint32_t changed_flags;
2195 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2196 int disabled_flag_changed;
2198 if (data.dsize != sizeof(*c)) {
2199 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2203 tmp_ctx = talloc_new(ctdb);
2204 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2206 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2208 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2209 talloc_free(tmp_ctx);
/* find the nodemap entry for the node whose flags changed */
2214 for (i=0;i<nodemap->num;i++) {
2215 if (nodemap->nodes[i].pnn == c->pnn) break;
2218 if (i == nodemap->num) {
2219 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2220 talloc_free(tmp_ctx);
2224 changed_flags = c->old_flags ^ c->new_flags;
2226 if (nodemap->nodes[i].flags != c->new_flags) {
2227 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* compare against what we had cached, not old_flags from the sender */
2230 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2232 nodemap->nodes[i].flags = c->new_flags;
2234 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2235 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2238 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2239 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2243 ctdb->recovery_master == ctdb->pnn &&
2244 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2245 /* Only do the takeover run if the perm disabled or unhealthy
2246 flags changed since these will cause an ip failover but not
2248 If the node became disconnected or banned this will also
2249 lead to an ip address failover but that is handled
2252 if (disabled_flag_changed) {
2253 rec->need_takeover_run = true;
2257 talloc_free(tmp_ctx);
2261  handler for when we need to push out flag changes to all other nodes
/* Message handler: propagate a node-flag change cluster-wide. Reads the
 * authoritative nodemap from the current recmaster, sanity-checks the
 * pnn in the message, then broadcasts MODIFY_FLAGS to every connected
 * node. */
2263 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2264 TDB_DATA data, void *private_data)
2267 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2268 struct ctdb_node_map *nodemap=NULL;
2269 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2273 /* find the recovery master */
2274 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2276 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2277 talloc_free(tmp_ctx);
2281 /* read the node flags from the recmaster */
2282 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2284 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2285 talloc_free(tmp_ctx);
2288 if (c->pnn >= nodemap->num) {
2289 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2290 talloc_free(tmp_ctx);
2294 /* send the flags update to all connected nodes */
2295 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2297 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2298 nodes, 0, CONTROL_TIMEOUT(),
2302 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2304 talloc_free(tmp_ctx);
2308 talloc_free(tmp_ctx);
/* Shared state for the async recmode verification: outstanding-reply
 * count (elided from this extract) and the aggregated result. */
2312 struct verify_recmode_normal_data {
/* worst result seen so far across all node replies */
2314 enum monitor_result status;
/* Completion callback for one async GET_RECMODE control. Downgrades the
 * aggregate status to MONITOR_FAILED on transport failure, or to
 * MONITOR_RECOVERY_NEEDED when a node reports it is not in NORMAL
 * mode. A decrement of the outstanding count is elided from this
 * extract. */
2317 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2319 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2322 /* one more node has responded with recmode data*/
2325 /* if we failed to get the recmode, then return an error and let
2326 the main loop try again.
2328 if (state->state != CTDB_CONTROL_DONE) {
/* only overwrite OK — keep a previously-recorded worse status */
2329 if (rmdata->status == MONITOR_OK) {
2330 rmdata->status = MONITOR_FAILED;
2335 /* if we got a response, then the recmode will be stored in the
2338 if (state->status != CTDB_RECOVERY_NORMAL) {
2339 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2340 rmdata->status = MONITOR_RECOVERY_NEEDED;
2347 /* verify that all nodes are in normal recovery mode */
/* Fan out async GET_RECMODE controls to every active node, pump the
 * event loop until all replies are in, and return the aggregated
 * monitor_result (OK / FAILED / RECOVERY_NEEDED). */
2348 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2350 struct verify_recmode_normal_data *rmdata;
2351 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2352 struct ctdb_client_control_state *state;
2353 enum monitor_result status;
2356 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2357 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2359 rmdata->status = MONITOR_OK;
2361 /* loop over all active nodes and send an async getrecmode call to
2363 for (j=0; j<nodemap->num; j++) {
2364 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2367 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2369 nodemap->nodes[j].pnn);
2370 if (state == NULL) {
2371 /* we failed to send the control, treat this as
2372 an error and try again next iteration
2374 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2375 talloc_free(mem_ctx);
2376 return MONITOR_FAILED;
2379 /* set up the callback functions */
2380 state->async.fn = verify_recmode_normal_callback;
2381 state->async.private_data = rmdata;
2383 /* one more control to wait for to complete */
2388 /* now wait for up to the maximum number of seconds allowed
2389 or until all nodes we expect a response from has replied
2391 while (rmdata->count > 0) {
2392 event_loop_once(ctdb->ev);
/* copy out before freeing mem_ctx, which owns rmdata */
2395 status = rmdata->status;
2396 talloc_free(mem_ctx);
/* Shared state for the async recmaster verification: back-pointer to
 * the recovery daemon (to record culprits), the expected recmaster pnn
 * and outstanding count (elided from this extract), and the aggregated
 * result. */
2401 struct verify_recmaster_data {
2402 struct ctdb_recoverd *rec;
/* worst result seen so far across all node replies */
2405 enum monitor_result status;
/* Completion callback for one async GET_RECMASTER control. Downgrades
 * the aggregate status to MONITOR_FAILED on transport failure; when a
 * node reports a different recmaster than expected, marks that node a
 * culprit and requests a new election (MONITOR_ELECTION_NEEDED). */
2408 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2410 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2413 /* one more node has responded with recmaster data*/
2416 /* if we failed to get the recmaster, then return an error and let
2417 the main loop try again.
2419 if (state->state != CTDB_CONTROL_DONE) {
/* only overwrite OK — keep a previously-recorded worse status */
2420 if (rmdata->status == MONITOR_OK) {
2421 rmdata->status = MONITOR_FAILED;
2426 /* if we got a response, then the recmaster will be stored in the
2429 if (state->status != rmdata->pnn) {
2430 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2431 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2432 rmdata->status = MONITOR_ELECTION_NEEDED;
2439 /* verify that all nodes agree that we are the recmaster */
/* Fan out async GET_RECMASTER controls to every active node, pump the
 * event loop until all replies are in, and return the aggregated
 * monitor_result (OK / FAILED / ELECTION_NEEDED). Mirrors
 * verify_recmode() but compares each reply against our own pnn. */
2440 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2442 struct ctdb_context *ctdb = rec->ctdb;
2443 struct verify_recmaster_data *rmdata;
2444 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2445 struct ctdb_client_control_state *state;
2446 enum monitor_result status;
2449 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2450 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2454 rmdata->status = MONITOR_OK;
2456 /* loop over all active nodes and send an async getrecmaster call to
2458 for (j=0; j<nodemap->num; j++) {
2459 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2462 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2464 nodemap->nodes[j].pnn);
2465 if (state == NULL) {
2466 /* we failed to send the control, treat this as
2467 an error and try again next iteration
2469 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2470 talloc_free(mem_ctx);
2471 return MONITOR_FAILED;
2474 /* set up the callback functions */
2475 state->async.fn = verify_recmaster_callback;
2476 state->async.private_data = rmdata;
2478 /* one more control to wait for to complete */
2483 /* now wait for up to the maximum number of seconds allowed
2484 or until all nodes we expect a response from has replied
2486 while (rmdata->count > 0) {
2487 event_loop_once(ctdb->ev);
/* copy out before freeing mem_ctx, which owns rmdata */
2490 status = rmdata->status;
2491 talloc_free(mem_ctx);
2496 /* called to check that the local allocation of public ip addresses is ok.
/*
 * Verify that the public IPs actually configured on this node's interfaces
 * match what the cluster thinks this node should be serving.  If not, ask
 * the recmaster (via CTDB_SRVID_TAKEOVER_RUN) to run an IP takeover.
 * NOTE(review): several guard lines (e.g. "if (ret != 0) {", returns and
 * closing braces) are elided in this excerpt; comments below describe the
 * visible statements only.
 * Returns 0 on success paths visible here; error paths free mem_ctx and
 * presumably return nonzero -- confirm against the full source.
 */
2498 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
/* temporary talloc context; every exit path below frees it */
2500 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2501 struct ctdb_control_get_ifaces *ifaces = NULL;
2502 struct ctdb_all_public_ips *ips = NULL;
2503 struct ctdb_uptime *uptime1 = NULL;
2504 struct ctdb_uptime *uptime2 = NULL;
2506 bool need_iface_check = false;
2507 bool need_takeover_run = false;
/* snapshot uptime BEFORE reading the IP list; compared with uptime2 below
   to detect a recovery racing with this check */
2509 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2510 CTDB_CURRENT_NODE, &uptime1);
2512 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2513 talloc_free(mem_ctx);
2518 /* read the interfaces from the local node */
2519 ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
2521 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
2522 talloc_free(mem_ctx);
/* interface set changed since last iteration (first run, different count,
   or byte-wise difference of the cached copy) => force a takeover run */
2527 need_iface_check = true;
2528 } else if (rec->ifaces->num != ifaces->num) {
2529 need_iface_check = true;
2530 } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
2531 need_iface_check = true;
2534 if (need_iface_check) {
2535 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2536 "local node %u - force takeover run\n",
2538 need_takeover_run = true;
2541 /* read the ip allocation from the local node */
2542 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2544 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2545 talloc_free(mem_ctx);
/* second uptime snapshot, taken AFTER the IP list was read */
2549 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2550 CTDB_CURRENT_NODE, &uptime2);
2552 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2553 talloc_free(mem_ctx);
2557 /* skip the check if the startrecovery time has changed */
2558 if (timeval_compare(&uptime1->last_recovery_started,
2559 &uptime2->last_recovery_started) != 0) {
2560 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2561 talloc_free(mem_ctx);
2565 /* skip the check if the endrecovery time has changed */
2566 if (timeval_compare(&uptime1->last_recovery_finished,
2567 &uptime2->last_recovery_finished) != 0) {
2568 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2569 talloc_free(mem_ctx);
2573 /* skip the check if we have started but not finished recovery */
2574 if (timeval_compare(&uptime1->last_recovery_finished,
2575 &uptime1->last_recovery_started) != 1) {
2576 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2577 talloc_free(mem_ctx);
/* cache the fresh interface list on rec (ownership moves to rec) */
2582 talloc_free(rec->ifaces);
2583 rec->ifaces = talloc_steal(rec, ifaces);
2585 /* verify that we have the ip addresses we should have
2586 and we dont have ones we shouldnt have.
2587 if we find an inconsistency we set recmode to
2588 active on the local node and wait for the recmaster
2589 to do a full blown recovery.
2590 also if the pnn is -1 and we are healthy and can host the ip
2591 we also request a ip reallocation.
2593 if (ctdb->tunable.disable_ip_failover == 0) {
2594 for (j=0; j<ips->num; j++) {
/* flags == 0 means fully healthy, so we could host an unassigned IP */
2595 if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
2596 DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
2597 ctdb_addr_to_str(&ips->ips[j].addr)));
2598 need_takeover_run = true;
2599 } else if (ips->ips[j].pnn == pnn) {
/* IP is assigned to us but not actually configured locally */
2600 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2601 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2602 ctdb_addr_to_str(&ips->ips[j].addr)));
2603 need_takeover_run = true;
/* IP is configured locally but assigned elsewhere */
2606 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2607 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2608 ctdb_addr_to_str(&ips->ips[j].addr)));
2609 need_takeover_run = true;
2615 if (need_takeover_run) {
2616 struct takeover_run_reply rd;
2619 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
/* ask the recmaster to perform the takeover run on our behalf */
2623 data.dptr = (uint8_t *)&rd;
2624 data.dsize = sizeof(rd);
2626 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2628 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2631 talloc_free(mem_ctx);
/*
 * Async completion callback for CTDB_CONTROL_GET_NODEMAP: stores the
 * nodemap returned by node 'node_pnn' into the callers' result array
 * (callback_data), taking talloc ownership of the reply buffer.
 * Replies from out-of-range pnns are rejected with an error message.
 */
2636 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2638 struct ctdb_node_map **remote_nodemaps = callback_data;
/* guard against a reply claiming to come from a node we don't know */
2640 if (node_pnn >= ctdb->num_nodes) {
2641 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
/* steal outdata so the nodemap outlives the control's own context */
2645 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Fetch the nodemap from every active node (including ourselves) in
 * parallel and deposit each reply into remote_nodemaps[pnn] via
 * async_getnodemap_callback.  Returns nonzero if any node failed to
 * answer.  remote_nodemaps entries are talloc-parented to the array.
 */
2649 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2650 struct ctdb_node_map *nodemap,
2651 struct ctdb_node_map **remote_nodemaps)
/* last arg 'true' -- NOTE(review): presumably "include own node"; confirm
   against list_of_active_nodes() in the full source */
2655 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2656 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2658 CONTROL_TIMEOUT(), false, tdb_null,
2659 async_getnodemap_callback,
2661 remote_nodemaps) != 0) {
2662 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Outcome of the reclock child's file check; RECLOCK_CHECKING means the
   parent is still waiting for the child's reply over the pipe. */
2670 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/* State shared between check_recovery_lock() and its timeout/fd event
   handlers while a child process probes the recovery lock file. */
2671 struct ctdb_check_reclock_state {
2672 struct ctdb_context *ctdb;
2673 struct timeval start_time;
/* te: 15s watchdog timer; fde: read end of the child's status pipe */
2676 struct timed_event *te;
2677 struct fd_event *fde;
2678 enum reclock_child_status status;
2681 /* when we free the reclock state we must kill any child process.
/*
 * talloc destructor for ctdb_check_reclock_state: reports how long the
 * lock check took, closes both pipe ends that are still open, and kills
 * the child so it cannot outlive the check.
 */
2683 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2685 struct ctdb_context *ctdb = state->ctdb;
/* export the observed lock latency to the main daemon for statistics */
2687 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
/* fds are set to -1 once closed elsewhere, so only close live ones */
2689 if (state->fd[0] != -1) {
2690 close(state->fd[0]);
2693 if (state->fd[1] != -1) {
2694 close(state->fd[1]);
2697 kill(state->child, SIGKILL);
2702 called if our check_reclock child times out. this would happen if
2703 i/o to the reclock file blocks.
/*
 * Timed-event handler: the reclock child did not answer within the
 * watchdog period (15s, see check_recovery_lock), most likely because the
 * cluster filesystem is blocking I/O on the lock file.  Mark the check as
 * timed out; check_recovery_lock's event loop will then stop waiting.
 */
2705 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2706 struct timeval t, void *private_data)
2708 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2709 struct ctdb_check_reclock_state);
2711 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2712 state->status = RECLOCK_TIMEOUT;
2715 /* this is called when the child process has completed checking the reclock
2716 file and has written data back to us through the pipe.
/*
 * fd-event handler for the read end of the reclock child's pipe.  Cancels
 * the timeout watchdog, reads the single status byte from the child and
 * translates it into RECLOCK_OK / RECLOCK_FAILED on the shared state.
 */
2718 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2719 uint16_t flags, void *private_data)
2721 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2722 struct ctdb_check_reclock_state);
2726 /* we got a response from our child process so we can abort the
/* freeing the timed event unregisters the 15s timeout */
2729 talloc_free(state->te);
/* exactly one status byte is expected; anything else is a failure */
2732 ret = read(state->fd[0], &c, 1);
2733 if (ret != 1 || c != RECLOCK_OK) {
2734 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2735 state->status = RECLOCK_FAILED;
2740 state->status = RECLOCK_OK;
/*
 * Verify that we (the recmaster) still hold the recovery lock and that the
 * lock file is readable.  The actual pread() is done in a forked child so
 * a hung cluster filesystem cannot block the recovery daemon; the child
 * reports one status byte back over a pipe, with a 15 second watchdog.
 * Returns: NOTE(review): the success/failure return statements are elided
 * in this excerpt -- nonzero appears to mean "lock OK"; on RECLOCK_FAILED
 * the cached lock fd is invalidated.  Confirm against the full source.
 */
2744 static int check_recovery_lock(struct ctdb_context *ctdb)
2747 struct ctdb_check_reclock_state *state;
/* remembered so the child can poll whether its parent is still alive */
2748 pid_t parent = getpid();
2750 if (ctdb->recovery_lock_fd == -1) {
2751 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2755 state = talloc(ctdb, struct ctdb_check_reclock_state);
2756 CTDB_NO_MEMORY(ctdb, state);
2759 state->start_time = timeval_current();
2760 state->status = RECLOCK_CHECKING;
2764 ret = pipe(state->fd);
2767 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2771 state->child = ctdb_fork(ctdb);
2772 if (state->child == (pid_t)-1) {
2773 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
/* fork failed: close both pipe ends before bailing out */
2774 close(state->fd[0]);
2776 close(state->fd[1]);
/* ---- child process: probe the lock file and report one byte ---- */
2782 if (state->child == 0) {
2783 char cc = RECLOCK_OK;
/* child only writes; close the read end */
2784 close(state->fd[0]);
2787 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
/* the read itself is the health probe; a failure means we lost the lock
   or the CFS is broken */
2788 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2789 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2790 cc = RECLOCK_FAILED;
2793 write(state->fd[1], &cc, 1);
2794 /* make sure we die when our parent dies */
2795 while (kill(parent, 0) == 0 || errno != ESRCH) {
2797 write(state->fd[1], &cc, 1);
/* ---- parent process continues here ---- */
2801 close(state->fd[1]);
2803 set_close_on_exec(state->fd[0]);
2805 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
/* destructor guarantees pipe cleanup + SIGKILL of the child on free */
2807 talloc_set_destructor(state, check_reclock_destructor);
/* 15 second watchdog in case the child blocks on CFS I/O */
2809 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2810 ctdb_check_reclock_timeout, state);
2811 if (state->te == NULL) {
2812 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2817 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2819 reclock_child_handler,
2822 if (state->fde == NULL) {
2823 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
2827 tevent_fd_set_auto_close(state->fde);
/* pump the event loop until the child answers or the watchdog fires */
2829 while (state->status == RECLOCK_CHECKING) {
2830 event_loop_once(ctdb->ev);
2833 if (state->status == RECLOCK_FAILED) {
2834 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
/* drop the stale lock fd; a later recovery will re-acquire it */
2835 close(ctdb->recovery_lock_fd);
2836 ctdb->recovery_lock_fd = -1;
/*
 * Synchronise the recovery daemon's idea of the recovery lock file with
 * the main daemon's current setting.  Handles three cases: the reclock
 * was disabled, we have no cached name yet, or the name changed.  In the
 * last two cases verify_recovery_lock is cleared so the (possibly stale)
 * cached fd is not trusted, and any open fd is closed.
 * NOTE(review): return statements are elided in this excerpt; each branch
 * frees tmp_ctx before returning.
 */
2845 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2847 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2848 const char *reclockfile;
2850 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2851 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2852 talloc_free(tmp_ctx);
/* case 1: reclock has been disabled on the main daemon */
2856 if (reclockfile == NULL) {
2857 if (ctdb->recovery_lock_file != NULL) {
2858 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2859 talloc_free(ctdb->recovery_lock_file);
2860 ctdb->recovery_lock_file = NULL;
2861 if (ctdb->recovery_lock_fd != -1) {
2862 close(ctdb->recovery_lock_fd);
2863 ctdb->recovery_lock_fd = -1;
/* no file => nothing to verify */
2866 ctdb->tunable.verify_recovery_lock = 0;
2867 talloc_free(tmp_ctx);
/* case 2: first time we learn the reclock file name */
2871 if (ctdb->recovery_lock_file == NULL) {
2872 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2873 if (ctdb->recovery_lock_fd != -1) {
2874 close(ctdb->recovery_lock_fd);
2875 ctdb->recovery_lock_fd = -1;
2877 talloc_free(tmp_ctx);
/* case 3: unchanged -- nothing to do */
2882 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2883 talloc_free(tmp_ctx);
/* case 4: the file name changed; adopt the new name and drop the old fd.
   verify_recovery_lock is cleared because the old fd no longer proves
   anything about the new file */
2887 talloc_free(ctdb->recovery_lock_file);
2888 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2889 ctdb->tunable.verify_recovery_lock = 0;
2890 if (ctdb->recovery_lock_fd != -1) {
2891 close(ctdb->recovery_lock_fd);
2892 ctdb->recovery_lock_fd = -1;
2895 talloc_free(tmp_ctx);
/*
 * One iteration of the recovery daemon's monitoring loop (called roughly
 * once per recover_interval from monitor_cluster).  Verifies, in order:
 * the main daemon is alive; tunables/reclock settings are current; the
 * node maps, flags, recmaster and vnnmaps are consistent across all
 * active nodes; and IP assignments match reality.  Any inconsistency
 * triggers force_election() or do_recovery() as appropriate.
 * NOTE(review): many guard lines ("if (ret != 0) {", returns, braces) are
 * elided in this excerpt; comments describe the visible statements only.
 */
2899 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2900 TALLOC_CTX *mem_ctx)
2903 struct ctdb_node_map *nodemap=NULL;
2904 struct ctdb_node_map *recmaster_nodemap=NULL;
2905 struct ctdb_node_map **remote_nodemaps=NULL;
2906 struct ctdb_vnn_map *vnnmap=NULL;
2907 struct ctdb_vnn_map *remote_vnnmap=NULL;
2908 int32_t debug_level;
2913 /* verify that the main daemon is still running */
2914 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2915 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2919 /* ping the local daemon to tell it we are alive */
2920 ctdb_ctrl_recd_ping(ctdb);
2922 if (rec->election_timeout) {
2923 /* an election is in progress */
2927 /* read the debug level from the parent and update locally */
2928 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2930 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2933 LogLevel = debug_level;
2936 /* We must check if we need to ban a node here but we want to do this
2937 as early as possible so we dont wait until we have pulled the node
2938 map from the local node. thats why we have the hardcoded value 20
2940 for (i=0; i<ctdb->num_nodes; i++) {
2941 struct ctdb_banning_state *ban_state;
2943 if (ctdb->nodes[i]->ban_state == NULL) {
2946 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* fewer than 20 recent recovery-culprit hits: not banned yet */
2947 if (ban_state->count < 20) {
2950 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2951 ctdb->nodes[i]->pnn, ban_state->count,
2952 ctdb->tunable.recovery_ban_period));
2953 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
/* reset the counter so we don't re-ban immediately */
2954 ban_state->count = 0;
2957 /* get relevant tunables */
2958 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2960 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2964 /* get the current recovery lock file from the server */
2965 if (update_recovery_lock_file(ctdb) != 0) {
2966 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2970 /* Make sure that if recovery lock verification becomes disabled when
2973 if (ctdb->tunable.verify_recovery_lock == 0) {
2974 if (ctdb->recovery_lock_fd != -1) {
2975 close(ctdb->recovery_lock_fd);
2976 ctdb->recovery_lock_fd = -1;
2980 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2981 if (pnn == (uint32_t)-1) {
2982 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2986 /* get the vnnmap */
2987 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2989 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2994 /* get number of nodes */
/* refresh the cached nodemap on every iteration */
2996 talloc_free(rec->nodemap);
2997 rec->nodemap = NULL;
3000 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3002 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3005 nodemap = rec->nodemap;
3007 /* check which node is the recovery master */
3008 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3010 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3014 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
3015 if (rec->recmaster != pnn) {
3016 if (rec->ip_reallocate_ctx != NULL) {
/* freeing the ctx also frees the queued caller list entries */
3017 talloc_free(rec->ip_reallocate_ctx);
3018 rec->ip_reallocate_ctx = NULL;
3019 rec->reallocate_callers = NULL;
3022 /* if there are takeovers requested, perform it and notify the waiters */
3023 if (rec->reallocate_callers) {
3024 process_ipreallocate_requests(ctdb, rec);
3027 if (rec->recmaster == (uint32_t)-1) {
3028 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3029 force_election(rec, pnn, nodemap);
3034 /* if the local daemon is STOPPED, we verify that the databases are
3035 also frozen and thet the recmode is set to active
3037 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
3038 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3040 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3042 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3043 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
3045 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3047 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
3050 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3052 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
3059 /* If the local node is stopped, verify we are not the recmaster
3060 and yield this role if so
3062 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
3063 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
3064 force_election(rec, pnn, nodemap);
3068 /* check that we (recovery daemon) and the local ctdb daemon
3069 agrees on whether we are banned or not
3073 /* remember our own node flags */
3074 rec->node_flags = nodemap->nodes[pnn].flags;
3076 /* count how many active nodes there are */
3077 rec->num_active = 0;
3078 rec->num_connected = 0;
3079 for (i=0; i<nodemap->num; i++) {
3080 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3083 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3084 rec->num_connected++;
3089 /* verify that the recmaster node is still active */
3090 for (j=0; j<nodemap->num; j++) {
3091 if (nodemap->nodes[j].pnn==rec->recmaster) {
/* j indexes the recmaster's slot in nodemap after this loop */
3096 if (j == nodemap->num) {
3097 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3098 force_election(rec, pnn, nodemap);
3102 /* if recovery master is disconnected we must elect a new recmaster */
3103 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3104 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3105 force_election(rec, pnn, nodemap);
3109 /* grap the nodemap from the recovery master to check if it is banned */
3110 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3111 mem_ctx, &recmaster_nodemap);
3113 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3114 nodemap->nodes[j].pnn));
/* the recmaster's OWN view says it is inactive (banned/stopped) */
3119 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3120 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3121 force_election(rec, pnn, nodemap);
3126 /* verify that we have all ip addresses we should have and we dont
3127 * have addresses we shouldnt have.
3129 if (ctdb->tunable.disable_ip_failover == 0) {
/* skip the IP check while "ctdb disableip" has it suspended */
3130 if (rec->ip_check_disable_ctx == NULL) {
3131 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3132 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3138 /* if we are not the recmaster then we do not need to check
3139 if recovery is needed
3141 if (pnn != rec->recmaster) {
/* ---- everything below runs only on the recmaster ---- */
3146 /* ensure our local copies of flags are right */
3147 ret = update_local_flags(rec, nodemap);
3148 if (ret == MONITOR_ELECTION_NEEDED) {
3149 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3150 force_election(rec, pnn, nodemap);
3153 if (ret != MONITOR_OK) {
3154 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3158 if (ctdb->num_nodes != nodemap->num) {
3159 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3160 reload_nodes_file(ctdb);
3164 /* verify that all active nodes agree that we are the recmaster */
3165 switch (verify_recmaster(rec, nodemap, pnn)) {
3166 case MONITOR_RECOVERY_NEEDED:
3167 /* can not happen */
3169 case MONITOR_ELECTION_NEEDED:
3170 force_election(rec, pnn, nodemap);
3174 case MONITOR_FAILED:
3179 if (rec->need_recovery) {
3180 /* a previous recovery didn't finish */
3181 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3185 /* verify that all active nodes are in normal mode
3186 and not in recovery mode
3188 switch (verify_recmode(ctdb, nodemap)) {
3189 case MONITOR_RECOVERY_NEEDED:
3190 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3192 case MONITOR_FAILED:
3194 case MONITOR_ELECTION_NEEDED:
3195 /* can not happen */
3201 if (ctdb->tunable.verify_recovery_lock != 0) {
3202 /* we should have the reclock - check its not stale */
3203 ret = check_recovery_lock(ctdb);
3205 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3206 ctdb_set_culprit(rec, ctdb->pnn);
3207 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3212 /* get the nodemap for all active remote nodes
3214 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3215 if (remote_nodemaps == NULL) {
3216 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3219 for(i=0; i<nodemap->num; i++) {
3220 remote_nodemaps[i] = NULL;
3222 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3223 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3227 /* verify that all other nodes have the same nodemap as we have
3229 for (j=0; j<nodemap->num; j++) {
3230 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3234 if (remote_nodemaps[j] == NULL) {
3235 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3236 ctdb_set_culprit(rec, j);
3241 /* if the nodes disagree on how many nodes there are
3242 then this is a good reason to try recovery
3244 if (remote_nodemaps[j]->num != nodemap->num) {
3245 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3246 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3247 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3248 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3252 /* if the nodes disagree on which nodes exist and are
3253 active, then that is also a good reason to do recovery
3255 for (i=0;i<nodemap->num;i++) {
3256 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3257 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3258 nodemap->nodes[j].pnn, i,
3259 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3260 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3261 do_recovery(rec, mem_ctx, pnn, nodemap,
3267 /* verify the flags are consistent
3269 for (i=0; i<nodemap->num; i++) {
3270 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3274 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3275 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3276 nodemap->nodes[j].pnn,
3277 nodemap->nodes[i].pnn,
3278 remote_nodemaps[j]->nodes[i].flags,
3279 nodemap->nodes[j].flags));
/* NOTE(review): the branch condition deciding which side's flags win is
   elided here; a node's own view of its own flags is authoritative */
3281 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3282 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3283 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3284 do_recovery(rec, mem_ctx, pnn, nodemap,
3288 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3289 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3290 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3291 do_recovery(rec, mem_ctx, pnn, nodemap,
3300 /* there better be the same number of lmasters in the vnn map
3301 as there are active nodes or we will have to do a recovery
3303 if (vnnmap->size != rec->num_active) {
3304 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3305 vnnmap->size, rec->num_active));
3306 ctdb_set_culprit(rec, ctdb->pnn);
3307 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3311 /* verify that all active nodes in the nodemap also exist in
3314 for (j=0; j<nodemap->num; j++) {
3315 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3318 if (nodemap->nodes[j].pnn == pnn) {
3322 for (i=0; i<vnnmap->size; i++) {
3323 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3327 if (i == vnnmap->size) {
3328 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3329 nodemap->nodes[j].pnn));
3330 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3331 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3337 /* verify that all other nodes have the same vnnmap
3338 and are from the same generation
3340 for (j=0; j<nodemap->num; j++) {
3341 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3344 if (nodemap->nodes[j].pnn == pnn) {
3348 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3349 mem_ctx, &remote_vnnmap);
3351 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3352 nodemap->nodes[j].pnn));
3356 /* verify the vnnmap generation is the same */
3357 if (vnnmap->generation != remote_vnnmap->generation) {
3358 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3359 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3360 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3361 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3365 /* verify the vnnmap size is the same */
3366 if (vnnmap->size != remote_vnnmap->size) {
3367 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3368 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3369 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3370 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3374 /* verify the vnnmap is the same */
3375 for (i=0;i<vnnmap->size;i++) {
3376 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3377 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3378 nodemap->nodes[j].pnn));
3379 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3380 do_recovery(rec, mem_ctx, pnn, nodemap,
3387 /* we might need to change who has what IP assigned */
3388 if (rec->need_takeover_run) {
3389 uint32_t culprit = (uint32_t)-1;
/* clear first; re-set below if any step fails so we retry next loop */
3391 rec->need_takeover_run = false;
3393 /* update the list of public ips that a node can handle for
3396 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3398 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3400 rec->need_takeover_run = true;
3404 /* execute the "startrecovery" event script on all nodes */
3405 ret = run_startrecovery_eventscript(rec, nodemap);
3407 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3408 ctdb_set_culprit(rec, ctdb->pnn);
3409 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3413 ret = ctdb_takeover_run(ctdb, nodemap);
3415 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
3419 /* execute the "recovered" event script on all nodes */
3420 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3422 // we cant check whether the event completed successfully
3423 // since this script WILL fail if the node is in recovery mode
3424 // and if that race happens, the code here would just cause a second
3425 // cascading recovery.
3427 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3428 ctdb_set_culprit(rec, ctdb->pnn);
3429 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3436 the main monitoring loop
/*
 * Entry point of the recovery daemon proper: allocates the long-lived
 * ctdb_recoverd state, registers message handlers for every SRVID the
 * daemon reacts to, then loops forever calling main_loop() with a fresh
 * throw-away talloc context each iteration, throttled to at most one
 * iteration per recover_interval seconds.
 */
3438 static void monitor_cluster(struct ctdb_context *ctdb)
3440 struct ctdb_recoverd *rec;
3442 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* rec lives for the lifetime of the daemon, parented to ctdb */
3444 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3445 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3449 rec->priority_time = timeval_current();
3451 /* register a message port for sending memory dumps */
3452 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3454 /* register a message port for recovery elections */
3455 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3457 /* when nodes are disabled/enabled */
3458 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3460 /* when we are asked to puch out a flag change */
3461 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3463 /* register a message port for vacuum fetch */
3464 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3466 /* register a message port for reloadnodes */
3467 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3469 /* register a message port for performing a takeover run */
3470 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3472 /* register a message port for disabling the ip check for a short while */
3473 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3475 /* register a message port for updating the recovery daemons node assignment for an ip */
3476 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
/* per-iteration scratch context; freed after each main_loop() pass so
   nothing leaks across iterations */
3479 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3480 struct timeval start;
3484 DEBUG(DEBUG_CRIT,(__location__
3485 " Failed to create temp context\n"));
3489 start = timeval_current();
3490 main_loop(ctdb, rec, mem_ctx);
3491 talloc_free(mem_ctx);
3493 /* we only check for recovery once every second */
3494 elapsed = timeval_elapsed(&start);
/* sleep for the remainder of recover_interval if main_loop was quick */
3495 if (elapsed < ctdb->tunable.recover_interval) {
3496 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3503 event handler for when the main ctdbd dies
/*
 * fd-event handler on the pipe shared with the parent ctdbd: the fd only
 * becomes readable (EOF) when the parent closes its end, i.e. when the
 * main daemon has died -- so the recovery daemon exits too.
 */
3505 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3506 uint16_t flags, void *private_data)
3508 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3513 called regularly to verify that the recovery daemon is still running
/*
 * Timed event in the MAIN daemon (not the recovery daemon): every 30s,
 * probe the recovery daemon with kill(pid, 0).  If it is gone, shut the
 * whole node down cleanly (stop subsystems, release IPs, run the
 * "shutdown" event script) -- a node without a recovery daemon must not
 * keep serving.  Re-arms itself at the end.
 */
3515 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3516 struct timeval yt, void *p)
3518 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* kill(pid, 0) sends no signal; it only tests process existence */
3520 if (kill(ctdb->recoverd_pid, 0) != 0) {
3521 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
3523 ctdb_stop_recoverd(ctdb);
3524 ctdb_stop_keepalive(ctdb);
3525 ctdb_stop_monitoring(ctdb);
3526 ctdb_release_all_ips(ctdb);
3527 if (ctdb->methods != NULL) {
3528 ctdb->methods->shutdown(ctdb);
3530 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
/* schedule the next 30-second liveness check */
3535 event_add_timed(ctdb->ev, ctdb,
3536 timeval_current_ofs(30, 0),
3537 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap any exited children
 * (e.g. the reclock check child) with a non-blocking waitpid so they do
 * not linger as zombies.  ECHILD (no children left) is expected and not
 * logged as an error.
 */
3540 static void recd_sig_child_handler(struct event_context *ev,
3541 struct signal_event *se, int signum, int count,
3545 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3550 pid = waitpid(-1, &status, WNOHANG);
3552 if (errno != ECHILD) {
3553 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3558 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3564 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon.  The parent (main ctdbd) arms the 30s
 * ctdb_check_recd watchdog and returns; the child keeps the read end of
 * a pipe (so it notices when the parent dies), reconnects to ctdbd as a
 * client, installs a SIGCHLD handler and enters monitor_cluster() --
 * which never returns on the success path.
 * Returns 0 in the parent on success; the elided branches return an
 * error code on pipe/fork failure.
 */
3566 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3569 struct signal_event *se;
3570 struct tevent_fd *fde;
3572 if (pipe(fd) != 0) {
3576 ctdb->ctdbd_pid = getpid();
3578 ctdb->recoverd_pid = fork();
3579 if (ctdb->recoverd_pid == -1) {
/* ---- parent: remember the child pid, start the watchdog ---- */
3583 if (ctdb->recoverd_pid != 0) {
3585 event_add_timed(ctdb->ev, ctdb,
3586 timeval_current_ofs(30, 0),
3587 ctdb_check_recd, ctdb);
/* ---- child: becomes the recovery daemon from here on ---- */
/* re-seed the PRNG so parent and child do not share a random stream */
3593 srandom(getpid() ^ time(NULL));
3595 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
3596 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3600 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* watch the pipe's read end; it signals parent death (see
   ctdb_recoverd_parent) */
3602 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
3603 ctdb_recoverd_parent, &fd[0]);
3604 tevent_fd_set_auto_close(fde);
3606 /* set up a handler to pick up sigchld */
3607 se = event_add_signal(ctdb->ev, ctdb,
3609 recd_sig_child_handler,
3612 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3616 monitor_cluster(ctdb);
/* monitor_cluster() loops forever; reaching this line is a bug */
3618 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3623 shutdown the recovery daemon
/*
 * Ask the recovery daemon child to terminate with SIGTERM.  A pid of 0
 * means no recovery daemon was ever started, so there is nothing to do.
 */
3625 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3627 if (ctdb->recoverd_pid == 0) {
3631 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3632 kill(ctdb->recoverd_pid, SIGTERM);