4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
/* Singly-linked list entry; 'rd' identifies the client waiting for the
   callback.  NOTE(review): this extract elides the struct's closing brace
   and any further members — confirm against the full file. */
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
39 struct rd_memdump_reply *rd;
/* Per-node misbehaviour accounting used to decide when a node should be
   banned.  'last_reported_time' is compared against the recovery grace
   period tunable in ctdb_set_culprit_count().
   NOTE(review): the credit counter member and closing brace are elided
   from this extract. */
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;
48 private state of recovery daemon
/* Top-level state object for the recovery daemon.  One instance lives for
   the lifetime of the daemon; most functions below receive it as 'rec'. */
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
/* node most recently charged with causing a recovery (see ctdb_set_culprit_count) */
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
/* set when IP allocation is known/suspected inconsistent and a takeover run is required */
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
/* non-NULL while an election is in progress; cleared by ctdb_election_timeout() */
62 struct timed_event *election_timeout;
/* list of in-flight vacuum-fetch operations (see struct vacuum_info below) */
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
/* "ctdb ipreallocate" callers waiting to be notified after a takeover run */
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
67 struct ctdb_control_get_ifaces *ifaces;
/* Timeouts used for controls sent from the recovery daemon; both expand to
   an absolute timeval derived from tunables, so they must be evaluated in a
   scope where a 'ctdb' context pointer is visible. */
70 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
71 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
75 ban a node for a period of time
/* Ban node 'pnn' for 'ban_time' seconds by sending it a SET_BAN control.
   NOTE(review): this extract elides several original lines of the body
   (declarations, returns, closing braces); only visible lines are kept. */
77 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
80 struct ctdb_context *ctdb = rec->ctdb;
81 struct ctdb_ban_time bantime;
/* refuse to act on an invalid node number */
83 if (!ctdb_validate_pnn(ctdb, pnn)) {
84 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
88 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
91 bantime.time = ban_time;
/* deliver the ban request to the target node */
93 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
95 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* outcome of a cluster-monitoring pass: all fine, a recovery is required,
   a new recmaster election is required, or the check itself failed */
101 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
105 run the "recovered" eventscript on all nodes
/* Broadcast the END_RECOVERY control to all active nodes, which triggers the
   "recovered" eventscript on each of them.  'caller' is only used to make the
   error log message identify who requested the run.
   NOTE(review): body lines (declarations, return statements, trailing
   async-control arguments) are elided from this extract. */
107 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
112 tmp_ctx = talloc_new(ctdb);
113 CTDB_NO_MEMORY(ctdb, tmp_ctx);
/* send to every active node and wait for all replies */
115 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
116 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
118 CONTROL_TIMEOUT(), false, tdb_null,
121 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
123 talloc_free(tmp_ctx);
127 talloc_free(tmp_ctx);
132 remember the trouble maker
134 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
136 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
137 struct ctdb_banning_state *ban_state;
139 if (culprit > ctdb->num_nodes) {
140 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
144 if (ctdb->nodes[culprit]->ban_state == NULL) {
145 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
146 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
150 ban_state = ctdb->nodes[culprit]->ban_state;
151 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
152 /* this was the first time in a long while this node
153 misbehaved so we will forgive any old transgressions.
155 ban_state->count = 0;
158 ban_state->count += count;
159 ban_state->last_reported_time = timeval_current();
160 rec->last_culprit_node = culprit;
164 remember the trouble maker
/* convenience wrapper: charge the node a single ban credit */
166 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
168 ctdb_set_culprit_count(rec, culprit, 1);
172 /* this callback is called for every node that failed to execute the
/* Fail-callback for the async START_RECOVERY control: any node that could
   not run the "startrecovery" event is charged as a recovery culprit. */
175 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
177 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
179 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
181 ctdb_set_culprit(rec, node_pnn);
185 run the "startrecovery" eventscript on all nodes
/* Broadcast the START_RECOVERY control to all active nodes.  Nodes that fail
   are reported through startrecovery_fail_callback() above.
   NOTE(review): body lines (declarations, returns, trailing async-control
   arguments) are elided from this extract. */
187 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
191 struct ctdb_context *ctdb = rec->ctdb;
193 tmp_ctx = talloc_new(ctdb);
194 CTDB_NO_MEMORY(ctdb, tmp_ctx);
196 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
197 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
199 CONTROL_TIMEOUT(), false, tdb_null,
201 startrecovery_fail_callback,
203 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
204 talloc_free(tmp_ctx);
208 talloc_free(tmp_ctx);
212 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
214 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
215 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
218 if (node_pnn < ctdb->num_nodes) {
219 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
222 if (node_pnn == ctdb->pnn) {
223 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
228 update the node capabilities for all connected nodes
/* Query GET_CAPABILITIES from every active node; replies are folded into
   ctdb->nodes[] by async_getcap_callback() above.
   NOTE(review): some body lines are elided from this extract. */
230 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
235 tmp_ctx = talloc_new(ctdb);
236 CTDB_NO_MEMORY(ctdb, tmp_ctx);
238 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
239 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
243 async_getcap_callback, NULL,
245 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
246 talloc_free(tmp_ctx);
250 talloc_free(tmp_ctx);
/* Fail-callback for the FREEZE control during recovery: a node that cannot
   freeze is charged nodemap->num culprit credits (a heavy penalty). */
254 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
256 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
258 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
259 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Fail-callback for starting the recovery transaction: same heavy penalty
   as failing to freeze. */
262 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
264 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
266 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
267 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
271 change recovery mode on all nodes
/* Set the recovery mode on every active node.  When entering recovery
   (CTDB_RECOVERY_ACTIVE) all database priority levels are frozen first;
   failures are charged via set_recmode_fail_callback().
   NOTE(review): loop/closing braces and several async-control arguments
   are elided from this extract. */
273 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
279 tmp_ctx = talloc_new(ctdb);
280 CTDB_NO_MEMORY(ctdb, tmp_ctx);
282 /* freeze all nodes */
283 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
284 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
/* freeze each database priority level in turn (1..NUM_DB_PRIORITIES) */
287 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
288 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
293 set_recmode_fail_callback,
295 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
296 talloc_free(tmp_ctx);
/* broadcast the new recovery mode value to all nodes */
303 data.dsize = sizeof(uint32_t);
304 data.dptr = (unsigned char *)&rec_mode;
306 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
312 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
313 talloc_free(tmp_ctx);
317 talloc_free(tmp_ctx);
322 change recovery master on all node
/* Tell every active node the pnn of the new recovery master via the
   SET_RECMASTER control.
   NOTE(review): some body lines are elided from this extract. */
324 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
330 tmp_ctx = talloc_new(ctdb);
331 CTDB_NO_MEMORY(ctdb, tmp_ctx);
333 data.dsize = sizeof(uint32_t);
334 data.dptr = (unsigned char *)&pnn;
336 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
337 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
339 CONTROL_TIMEOUT(), false, data,
342 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
343 talloc_free(tmp_ctx);
347 talloc_free(tmp_ctx);
351 /* update all remote nodes to use the same db priority that we have
352 this can fail if the remove node has not yet been upgraded to
353 support this function, so we always return success and never fail
354 a recovery if this call fails.
/* Read each local database's priority and push it to all active nodes.
   As the header comment says, failures here are deliberately tolerated
   (older nodes may not support the control).
   NOTE(review): loop braces and some async-control arguments are elided
   from this extract. */
356 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
357 struct ctdb_node_map *nodemap,
358 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
363 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
365 /* step through all local databases */
366 for (db=0; db<dbmap->num;db++) {
368 struct ctdb_db_priority db_prio;
371 db_prio.db_id = dbmap->dbs[db].dbid;
/* ask the local node what priority this database currently has */
372 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
374 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
378 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
380 data.dptr = (uint8_t *)&db_prio;
381 data.dsize = sizeof(db_prio);
/* broadcast the priority to all active nodes; errors are only logged */
383 if (ctdb_client_async_control(ctdb,
384 CTDB_CONTROL_SET_DB_PRIORITY,
386 CONTROL_TIMEOUT(), false, data,
389 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
397 ensure all other nodes have attached to any databases that we have
/* For every active remote node, compare its database map against ours and
   create (attach) any database it is missing.
   NOTE(review): continue statements, ret checks and closing braces are
   elided from this extract. */
399 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
400 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
403 struct ctdb_dbid_map *remote_dbmap;
405 /* verify that all other nodes have all our databases */
406 for (j=0; j<nodemap->num; j++) {
407 /* no need to check our own node */
408 if (nodemap->nodes[j].pnn == pnn) {
411 /* dont check nodes that are unavailable */
412 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
416 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
417 mem_ctx, &remote_dbmap);
419 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
423 /* step through all local databases */
424 for (db=0; db<dbmap->num;db++) {
/* look for this dbid in the remote node's map */
428 for (i=0;i<remote_dbmap->num;i++) {
429 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
433 /* the remote node already have this database */
434 if (i!=remote_dbmap->num) {
437 /* ok so we need to create this database */
438 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
441 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
444 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
445 mem_ctx, name, dbmap->dbs[db].persistent);
447 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
458 ensure we are attached to any databases that anyone else is attached to
/* Mirror of create_missing_remote_databases(): attach locally to any
   database a remote node has that we do not, then re-read our own dbmap
   so '*dbmap' reflects the new attachments.
   NOTE(review): continue statements, ret checks and closing braces are
   elided from this extract. */
460 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
461 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
464 struct ctdb_dbid_map *remote_dbmap;
466 /* verify that we have all database any other node has */
467 for (j=0; j<nodemap->num; j++) {
468 /* no need to check our own node */
469 if (nodemap->nodes[j].pnn == pnn) {
472 /* dont check nodes that are unavailable */
473 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
477 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
478 mem_ctx, &remote_dbmap);
480 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
484 /* step through all databases on the remote node */
485 for (db=0; db<remote_dbmap->num;db++) {
/* look for this remote dbid in our own map */
488 for (i=0;i<(*dbmap)->num;i++) {
489 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
493 /* we already have this db locally */
494 if (i!=(*dbmap)->num) {
497 /* ok so we need to create this database and
500 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
501 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
503 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
504 nodemap->nodes[j].pnn));
507 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
508 remote_dbmap->dbs[db].persistent);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our dbmap now that we may have attached to new databases */
513 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
515 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
526 pull the remote database contents from one node into the recdb
/* PULL_DB the given database from 'srcnode' and merge its records into the
   temporary recovery tdb.  A pulled record replaces an existing one only if
   it is newer by rsn (or same-rsn but dmastered away from the recovery
   master) — this is the core record-merge rule of recovery.
   NOTE(review): loop header, continue/return lines and closing braces are
   elided from this extract; the merge condition is annotated as-visible. */
528 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
529 struct tdb_wrap *recdb, uint32_t dbid,
534 struct ctdb_marshall_buffer *reply;
535 struct ctdb_rec_data *rec;
537 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
539 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
540 CONTROL_TIMEOUT(), &outdata);
542 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
543 talloc_free(tmp_ctx);
547 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the reply is at least big enough for the marshall header */
549 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
550 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
551 talloc_free(tmp_ctx);
/* walk the marshalled records; each record is key + data, back to back */
555 rec = (struct ctdb_rec_data *)&reply->data[0];
559 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
561 struct ctdb_ltdb_header *hdr;
564 key.dptr = &rec->data[0];
565 key.dsize = rec->keylen;
566 data.dptr = &rec->data[key.dsize];
567 data.dsize = rec->datalen;
/* the record data starts with the ltdb header (rsn, dmaster, ...) */
569 hdr = (struct ctdb_ltdb_header *)data.dptr;
571 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
572 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
573 talloc_free(tmp_ctx);
577 /* fetch the existing record, if any */
578 existing = tdb_fetch(recdb->tdb, key);
580 if (existing.dptr != NULL) {
581 struct ctdb_ltdb_header header;
582 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
583 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
584 (unsigned)existing.dsize, srcnode));
586 talloc_free(tmp_ctx);
589 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing record unless the pulled one wins the merge rule */
591 if (!(header.rsn < hdr->rsn ||
592 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
597 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
598 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
599 talloc_free(tmp_ctx);
604 talloc_free(tmp_ctx);
610 pull all the remote database contents into the recdb
/* Pull the given database from every active node, merging by rsn via
   pull_one_remote_database().  A node that fails the pull is charged
   nodemap->num culprit credits.
   NOTE(review): continue/return lines and closing braces are elided. */
612 static int pull_remote_database(struct ctdb_context *ctdb,
613 struct ctdb_recoverd *rec,
614 struct ctdb_node_map *nodemap,
615 struct tdb_wrap *recdb, uint32_t dbid,
620 /* pull all records from all other nodes across onto this node
621 (this merges based on rsn)
623 for (j=0; j<nodemap->num; j++) {
624 /* dont merge from nodes that are unavailable */
625 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
628 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
629 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
630 nodemap->nodes[j].pnn));
631 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
641 update flags on all active nodes
/* Push a node's flags to all nodes via MODFLAGS (set 'flags', clear the
   complement).  NOTE(review): return statements are elided here. */
643 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
647 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
649 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
657 ensure all nodes have the same vnnmap we do
/* Push our vnn map to every active node with SETVNNMAP.
   NOTE(review): continue/return lines and closing braces are elided. */
659 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
660 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
664 /* push the new vnn map out to all the nodes */
665 for (j=0; j<nodemap->num; j++) {
666 /* dont push to nodes that are unavailable */
667 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
671 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
673 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* NOTE(review): the 'struct vacuum_info {' opening line is elided from this
   extract; the members below belong to it.  One instance tracks an ongoing
   vacuum-fetch run of marshalled records ('recs', cursor 'r') from a source
   node into database 'ctdb_db', linked into rec->vacuum_info. */
683 struct vacuum_info *next, *prev;
684 struct ctdb_recoverd *rec;
686 struct ctdb_db_context *ctdb_db;
687 struct ctdb_marshall_buffer *recs;
688 struct ctdb_rec_data *r;
/* forward declaration: processes the next record in a vacuum_info */
691 static void vacuum_fetch_next(struct vacuum_info *v);
694 called when a vacuum fetch has completed - just free it and do the next one
/* Completion callback for the async ctdb_call; chain on to the next record. */
696 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
698 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
700 vacuum_fetch_next(v);
705 process the next element from the vacuum list
/* Walk the marshalled record list, migrating each record to this node with a
   NULL-function ctdb_call (CTDB_IMMEDIATE_MIGRATION).  Records are skipped
   when the chainlock cannot be taken non-blockingly, when the local copy is
   missing/short, or when we are already dmaster.  The chainlock/fetch/unlock
   ordering around ctdb_call_send is deliberate — do not reorder.
   NOTE(review): loop-exit lines, error returns and closing braces are elided
   from this extract. */
707 static void vacuum_fetch_next(struct vacuum_info *v)
709 struct ctdb_call call;
710 struct ctdb_rec_data *r;
712 while (v->recs->count) {
713 struct ctdb_client_call_state *state;
715 struct ctdb_ltdb_header *hdr;
718 call.call_id = CTDB_NULL_FUNC;
719 call.flags = CTDB_IMMEDIATE_MIGRATION;
720 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
/* advance the cursor past the current record before issuing the call */
723 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
726 call.key.dptr = &r->data[0];
727 call.key.dsize = r->keylen;
729 /* ensure we don't block this daemon - just skip a record if we can't get
731 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
735 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
736 if (data.dptr == NULL) {
737 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
741 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
743 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
747 hdr = (struct ctdb_ltdb_header *)data.dptr;
748 if (hdr->dmaster == v->rec->ctdb->pnn) {
749 /* its already local */
751 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* issue the migration call, then drop the chainlock */
757 state = ctdb_call_send(v->ctdb_db, &call);
758 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
760 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
/* resume from vacuum_fetch_callback when the call completes */
764 state->async.fn = vacuum_fetch_callback;
765 state->async.private_data = v;
774 destroy a vacuum info structure
/* talloc destructor: unlink from rec->vacuum_info on free */
776 static int vacuum_info_destructor(struct vacuum_info *v)
778 DLIST_REMOVE(v->rec->vacuum_info, v);
784 handler for vacuum fetch
/* Message handler: another node sent us a marshalled list of records it wants
   us to take over (vacuuming).  Validates the list, deduplicates against
   in-flight vacuum runs, attaches to the database, then starts an async
   vacuum_fetch_next() chain over a copy of the records.
   NOTE(review): early-return lines, loop braces and some declarations are
   elided from this extract. */
786 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
787 TDB_DATA data, void *private_data)
789 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
790 struct ctdb_marshall_buffer *recs;
792 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
794 struct ctdb_dbid_map *dbmap=NULL;
795 bool persistent = false;
796 struct ctdb_db_context *ctdb_db;
797 struct ctdb_rec_data *r;
799 struct vacuum_info *v;
801 recs = (struct ctdb_marshall_buffer *)data.dptr;
802 r = (struct ctdb_rec_data *)&recs->data[0];
/* nothing to do for an empty record list */
804 if (recs->count == 0) {
805 talloc_free(tmp_ctx);
/* avoid starting a second run for the same (srcnode, db) pair */
811 for (v=rec->vacuum_info;v;v=v->next) {
812 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
813 /* we're already working on records from this node */
814 talloc_free(tmp_ctx);
819 /* work out if the database is persistent */
820 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
822 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
823 talloc_free(tmp_ctx);
827 for (i=0;i<dbmap->num;i++) {
828 if (dbmap->dbs[i].dbid == recs->db_id) {
829 persistent = dbmap->dbs[i].persistent;
833 if (i == dbmap->num) {
834 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
835 talloc_free(tmp_ctx);
839 /* find the name of this database */
840 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
841 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
842 talloc_free(tmp_ctx);
/* attach (or re-attach) to the database by name */
847 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
848 if (ctdb_db == NULL) {
849 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
850 talloc_free(tmp_ctx);
854 v = talloc_zero(rec, struct vacuum_info);
856 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
857 talloc_free(tmp_ctx);
862 v->srcnode = srcnode;
863 v->ctdb_db = ctdb_db;
/* copy the records: 'data' belongs to the message and will not outlive us */
864 v->recs = talloc_memdup(v, recs, data.dsize);
865 if (v->recs == NULL) {
866 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
868 talloc_free(tmp_ctx);
871 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
873 DLIST_ADD(rec->vacuum_info, v);
875 talloc_set_destructor(v, vacuum_info_destructor);
/* kick off the async processing chain */
877 vacuum_fetch_next(v);
878 talloc_free(tmp_ctx);
883 called when ctdb_wait_timeout should finish
/* Timed-event handler: flags completion for ctdb_wait_timeout() below.
   NOTE(review): the line setting *timed_out is elided from this extract. */
885 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
886 struct timeval yt, void *p)
888 uint32_t *timed_out = (uint32_t *)p;
893 wait for a given number of seconds
/* Block for 'secs' seconds while still pumping the event loop, so other
   events keep being serviced during the wait. */
895 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
897 uint32_t timed_out = 0;
898 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
900 event_loop_once(ctdb->ev);
905 called when an election times out (ends)
/* Clearing election_timeout is what makes ctdb_wait_election() return. */
907 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
908 struct timeval t, void *p)
910 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
911 rec->election_timeout = NULL;
913 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
918 wait for an election to finish. It finished election_timeout seconds after
919 the last election packet is received
/* Pump the event loop until ctdb_election_timeout() clears the timer. */
921 static void ctdb_wait_election(struct ctdb_recoverd *rec)
923 struct ctdb_context *ctdb = rec->ctdb;
924 while (rec->election_timeout) {
925 event_loop_once(ctdb->ev);
930 Update our local flags from all remote connected nodes.
931 This is only run when we are, or we believe we are, the recovery master
/* For each reachable remote node, fetch its nodemap and reconcile flag
   differences: push the remote node's own view of its flags cluster-wide
   and update our local copy.  Returns a monitor_result code.
   NOTE(review): continue statements, ret checks and closing braces are
   elided from this extract. */
933 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
936 struct ctdb_context *ctdb = rec->ctdb;
937 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
939 /* get the nodemap for all active remote nodes and verify
940 they are the same as for this node
942 for (j=0; j<nodemap->num; j++) {
943 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
946 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
949 if (nodemap->nodes[j].pnn == ctdb->pnn) {
953 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
954 mem_ctx, &remote_nodemap);
956 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
957 nodemap->nodes[j].pnn));
958 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
959 talloc_free(mem_ctx);
960 return MONITOR_FAILED;
/* a node's own report of its flags is authoritative */
962 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
963 /* We should tell our daemon about this so it
964 updates its flags or else we will log the same
965 message again in the next iteration of recovery.
966 Since we are the recovery master we can just as
967 well update the flags on all nodes.
969 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
971 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
975 /* Update our local copy of the flags in the recovery
978 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
979 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
980 nodemap->nodes[j].flags));
981 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
983 talloc_free(remote_nodemap);
985 talloc_free(mem_ctx);
990 /* Create a new random generation ip.
991 The generation id can not be the INVALID_GENERATION id
/* Loop drawing random() values until one differs from INVALID_GENERATION.
   NOTE(review): loop construct and return are elided from this extract. */
993 static uint32_t new_generation(void)
998 generation = random();
1000 if (generation != INVALID_GENERATION) {
1010 create a temporary working database
/* Open a fresh scratch tdb under db_directory_state used to merge all pulled
   records during recovery.  NOLOCK is safe because only this daemon touches
   it; NOMMAP is added under valgrind.  Opened with O_EXCL so a stale file
   from a previous run causes a visible failure.
   NOTE(review): the name suffix argument, return statements and closing
   brace are elided from this extract. */
1012 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1015 struct tdb_wrap *recdb;
1018 /* open up the temporary recovery database */
1019 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1020 ctdb->db_directory_state,
1027 tdb_flags = TDB_NOLOCK;
1028 if (ctdb->valgrinding) {
1029 tdb_flags |= TDB_NOMMAP;
1031 tdb_flags |= TDB_DISALLOW_NESTING;
1033 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1034 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1035 if (recdb == NULL) {
1036 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1046 a traverse function for pulling all relevent records from recdb
/* Traversal state shared with traverse_recdb(): the marshall buffer being
   built, its running length, a failure flag and the persistence of the db.
   NOTE(review): the 'struct recdb_data {' line and remaining members are
   elided from this extract. */
1049 struct ctdb_context *ctdb;
1050 struct ctdb_marshall_buffer *recdata;
/* tdb_traverse_read callback: append one recdb record to the marshall
   buffer in params, rewriting the dmaster to this node for non-persistent
   databases.  Sets params->failed on allocation errors.
   NOTE(review): return statements and closing braces are elided from this
   extract. */
1056 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1058 struct recdb_data *params = (struct recdb_data *)p;
1059 struct ctdb_rec_data *rec;
1060 struct ctdb_ltdb_header *hdr;
1062 /* skip empty records */
1063 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1067 /* update the dmaster field to point to us */
1068 hdr = (struct ctdb_ltdb_header *)data.dptr;
1069 if (!params->persistent) {
1070 hdr->dmaster = params->ctdb->pnn;
1071 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1074 /* add the record to the blob ready to send to the nodes */
1075 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1077 params->failed = true;
/* grow the marshall buffer to hold the new record */
1080 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1081 if (params->recdata == NULL) {
1082 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1083 rec->length + params->len));
1084 params->failed = true;
1087 params->recdata->count++;
1088 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1089 params->len += rec->length;
1096 push the recdb database out to all nodes
1098 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1100 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1102 struct recdb_data params;
1103 struct ctdb_marshall_buffer *recdata;
1105 TALLOC_CTX *tmp_ctx;
1108 tmp_ctx = talloc_new(ctdb);
1109 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1111 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1112 CTDB_NO_MEMORY(ctdb, recdata);
1114 recdata->db_id = dbid;
1117 params.recdata = recdata;
1118 params.len = offsetof(struct ctdb_marshall_buffer, data);
1119 params.failed = false;
1120 params.persistent = persistent;
1122 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1123 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1124 talloc_free(params.recdata);
1125 talloc_free(tmp_ctx);
1129 if (params.failed) {
1130 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1131 talloc_free(params.recdata);
1132 talloc_free(tmp_ctx);
1136 recdata = params.recdata;
1138 outdata.dptr = (void *)recdata;
1139 outdata.dsize = params.len;
1141 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1142 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1144 CONTROL_TIMEOUT(), false, outdata,
1147 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1148 talloc_free(recdata);
1149 talloc_free(tmp_ctx);
1153 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1154 dbid, recdata->count));
1156 talloc_free(recdata);
1157 talloc_free(tmp_ctx);
1164 go through a full recovery on one database
/* Recover a single database: create a scratch recdb, pull and merge every
   node's records into it, wipe the database on all nodes inside the given
   recovery transaction, then push the merged contents back out.
   NOTE(review): return statements, some arguments and the function tail are
   elided from this extract. */
1166 static int recover_database(struct ctdb_recoverd *rec,
1167 TALLOC_CTX *mem_ctx,
1171 struct ctdb_node_map *nodemap,
1172 uint32_t transaction_id)
1174 struct tdb_wrap *recdb;
1176 struct ctdb_context *ctdb = rec->ctdb;
1178 struct ctdb_control_wipe_database w;
1181 recdb = create_recdb(ctdb, mem_ctx);
1182 if (recdb == NULL) {
1186 /* pull all remote databases onto the recdb */
1187 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1189 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1193 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1195 /* wipe all the remote databases. This is safe as we are in a transaction */
1197 w.transaction_id = transaction_id;
1199 data.dptr = (void *)&w;
1200 data.dsize = sizeof(w);
1202 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1203 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1205 CONTROL_TIMEOUT(), false, data,
1208 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1213 /* push out the correct database. This sets the dmaster and skips
1214 the empty records */
1215 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1221 /* all done with this database */
1228 reload the nodes file
/* Thin wrapper: re-read the nodes file into the ctdb context. */
1230 static void reload_nodes_file(struct ctdb_context *ctdb)
1233 ctdb_load_nodes_file(ctdb);
/* Refresh the known and available public-IP lists for every active node.
   On any failure '*culprit' is set to the offending pnn so the caller can
   charge it.  Inconsistent allocations flag a takeover run via
   rec->need_takeover_run.
   NOTE(review): continue/return lines, some call arguments and closing
   braces are elided from this extract. */
1236 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1237 struct ctdb_recoverd *rec,
1238 struct ctdb_node_map *nodemap,
/* sanity check: nodemap and ctdb->nodes must describe the same node count */
1244 if (ctdb->num_nodes != nodemap->num) {
1245 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1246 ctdb->num_nodes, nodemap->num));
1248 *culprit = ctdb->pnn;
1253 for (j=0; j<nodemap->num; j++) {
1254 /* release any existing data */
1255 if (ctdb->nodes[j]->known_public_ips) {
1256 talloc_free(ctdb->nodes[j]->known_public_ips);
1257 ctdb->nodes[j]->known_public_ips = NULL;
1259 if (ctdb->nodes[j]->available_public_ips) {
1260 talloc_free(ctdb->nodes[j]->available_public_ips);
1261 ctdb->nodes[j]->available_public_ips = NULL;
1264 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1268 /* grab a new shiny list of public ips from the node */
1269 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1271 ctdb->nodes[j]->pnn,
1274 &ctdb->nodes[j]->known_public_ips);
1276 DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1277 ctdb->nodes[j]->pnn));
1279 *culprit = ctdb->nodes[j]->pnn;
/* a mismatch here means IPs must be reshuffled on the next takeover run */
1284 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1285 DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1286 rec->need_takeover_run = true;
1289 /* grab a new shiny list of public ips from the node */
1290 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1292 ctdb->nodes[j]->pnn,
1294 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1295 &ctdb->nodes[j]->available_public_ips);
1297 DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1298 ctdb->nodes[j]->pnn));
1300 *culprit = ctdb->nodes[j]->pnn;
1310 we are the recmaster, and recovery is needed - start a recovery run
/*
 * NOTE(review): this listing is incomplete - the extraction dropped many
 * lines (error-check bodies, early returns, closing braces).  The comments
 * below describe only what the visible lines establish.
 *
 * Runs a full cluster recovery as the recovery master: bans repeat-culprit
 * nodes, optionally takes the recovery lock, recreates missing databases,
 * freezes the cluster, pulls/pushes database contents under a transaction,
 * rebuilds the vnn map, redistributes public IPs and finally re-enables
 * normal mode.  Returns an int status (success path visible; the failure
 * returns are on elided lines).
 */
1312 static int do_recovery(struct ctdb_recoverd *rec,
1313 TALLOC_CTX *mem_ctx, uint32_t pnn,
1314 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1316 struct ctdb_context *ctdb = rec->ctdb;
1318 uint32_t generation;
1319 struct ctdb_dbid_map *dbmap;
1322 struct timeval start_time;
1323 uint32_t culprit = (uint32_t)-1;
1325 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1327 /* if recovery fails, force it again */
1328 rec->need_recovery = true;
/* Ban any node blamed for too many recent recoveries; the threshold is
 * 2 * num_nodes and the counter is reset once the node is banned. */
1330 for (i=0; i<ctdb->num_nodes; i++) {
1331 struct ctdb_banning_state *ban_state;
1333 if (ctdb->nodes[i]->ban_state == NULL) {
1336 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1337 if (ban_state->count < 2*ctdb->num_nodes) {
1340 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1341 ctdb->nodes[i]->pnn, ban_state->count,
1342 ctdb->tunable.recovery_ban_period));
1343 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ban_state->count);
1344 ban_state->count = 0;
/* If VerifyRecoveryLock is enabled we must hold the shared recovery
 * lock file before touching the cluster; failing to take it bans the
 * local node (pnn) and aborts this recovery attempt. */
1348 if (ctdb->tunable.verify_recovery_lock != 0) {
1349 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1350 start_time = timeval_current();
1351 if (!ctdb_recovery_lock(ctdb, true)) {
1352 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1353 "and ban ourself for %u seconds\n",
1354 ctdb->tunable.recovery_ban_period));
1355 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
/* report how long taking the lock took, for monitoring */
1358 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1359 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1362 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1364 /* get a list of all databases */
1365 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1367 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1371 /* we do the db creation before we set the recovery mode, so the freeze happens
1372 on all databases we will be dealing with. */
1374 /* verify that we have all the databases any other node has */
1375 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1377 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1381 /* verify that all other nodes have all our databases */
1382 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1384 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1387 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1389 /* update the database priority for all remote databases */
1390 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1392 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1394 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1397 /* set recovery mode to active on all nodes */
1398 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1400 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1404 /* execute the "startrecovery" event script on all nodes */
1405 ret = run_startrecovery_eventscript(rec, nodemap);
1407 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1412 update all nodes to have the same flags that we have
/* first flag push: skip nodes that are disconnected */
1414 for (i=0;i<nodemap->num;i++) {
1415 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1419 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1421 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1426 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1428 /* pick a new generation number */
1429 generation = new_generation();
1431 /* change the vnnmap on this node to use the new generation
1432 number but not on any other nodes.
1433 this guarantees that if we abort the recovery prematurely
1434 for some reason (a node stops responding?)
1435 that we can just return immediately and we will reenter
1436 recovery shortly again.
1437 I.e. we deliberately leave the cluster with an inconsistent
1438 generation id to allow us to abort recovery at any stage and
1439 just restart it from scratch.
1441 vnnmap->generation = generation;
1442 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1444 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* the new generation number is broadcast as the transaction id */
1448 data.dptr = (void *)&generation;
1449 data.dsize = sizeof(uint32_t);
/* start a TDB transaction on all active nodes; if that fails, attempt
 * to cancel any transactions that did start before bailing out */
1451 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1452 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1454 CONTROL_TIMEOUT(), false, data,
1456 transaction_start_fail_callback,
1458 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1459 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1461 CONTROL_TIMEOUT(), false, tdb_null,
1465 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1470 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* pull/push every database to rebuild a consistent copy everywhere */
1472 for (i=0;i<dbmap->num;i++) {
1473 ret = recover_database(rec, mem_ctx,
1475 dbmap->dbs[i].persistent,
1476 pnn, nodemap, generation);
1478 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1483 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1485 /* commit all the changes */
1486 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1488 CONTROL_TIMEOUT(), false, data,
1491 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1495 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1498 /* update the capabilities for all nodes */
1499 ret = update_capabilities(ctdb, nodemap);
1501 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1505 /* build a new vnn map with all the currently active and
1507 generation = new_generation();
1508 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1509 CTDB_NO_MEMORY(ctdb, vnnmap);
1510 vnnmap->generation = generation;
1512 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1513 CTDB_NO_MEMORY(ctdb, vnnmap->map);
/* only active nodes with the LMASTER capability become lmasters */
1514 for (i=j=0;i<nodemap->num;i++) {
1515 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1518 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1519 /* this node can not be an lmaster */
1520 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1525 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1526 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1527 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* degenerate case: no node offered CTDB_CAP_LMASTER, so the recmaster
 * (this node) becomes the sole lmaster */
1530 if (vnnmap->size == 0) {
1531 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1533 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1534 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1535 vnnmap->map[0] = pnn;
1538 /* update to the new vnnmap on all nodes */
1539 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1541 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1545 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1547 /* update recmaster to point to us for all nodes */
1548 ret = set_recovery_master(ctdb, nodemap, pnn);
1550 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1554 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1557 update all nodes to have the same flags that we have
/* second flag push, after databases and vnnmap are consistent */
1559 for (i=0;i<nodemap->num;i++) {
1560 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1564 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1566 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1571 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1573 /* disable recovery mode */
1574 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1576 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1580 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1583 tell nodes to takeover their public IPs
/* refresh each node's known/available public IP lists, then rebalance */
1585 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1587 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1591 rec->need_takeover_run = false;
1592 ret = ctdb_takeover_run(ctdb, nodemap);
1594 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1597 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1599 /* execute the "recovered" event script on all nodes */
1600 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1602 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1606 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1608 /* send a message to all clients telling them that the cluster
1609 has been reconfigured */
1610 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1612 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1614 rec->need_recovery = false;
1616 /* we managed to complete a full recovery, make sure to forgive
1617 any past sins by the nodes that could now participate in the
1620 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1621 for (i=0;i<nodemap->num;i++) {
1622 struct ctdb_banning_state *ban_state;
1624 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1628 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1629 if (ban_state == NULL) {
1633 ban_state->count = 0;
1637 /* We just finished a recovery successfully.
1638 We now wait for rerecovery_timeout before we allow
1639 another recovery to take place.
/* NOTE(review): ctdb_wait_timeout blocks the recovery daemon's event
 * loop for RerecoveryTimeout seconds - intentional damping of
 * back-to-back recoveries, per the comment above. */
1641 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1642 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1643 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
1650 elections are won by first checking the number of connected nodes, then
1651 the priority time, then the pnn
/* Election broadcast payload.  Per the comment above: candidates are
 * compared first on number of connected nodes, then on priority_time
 * (longest-running wins), then on pnn.
 * NOTE(review): the pnn member referenced elsewhere (em->pnn) sits on a
 * line elided from this listing. */
1653 struct election_message {
1654 uint32_t num_connected;
1655 struct timeval priority_time;
1657 uint32_t node_flags;
1661 form this nodes election data
/* Fill *em with this node's election credentials: our pnn, the time the
 * recovery daemon started (priority_time), our node flags and the count
 * of nodes we consider connected.  If we lack the RECMASTER capability
 * the data is deliberately made as weak as possible so we lose. */
1663 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1666 struct ctdb_node_map *nodemap;
1667 struct ctdb_context *ctdb = rec->ctdb;
1671 em->pnn = rec->ctdb->pnn;
1672 em->priority_time = rec->priority_time;
1674 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1676 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
/* cache our own flags; ctdb_election_win consults rec->node_flags */
1680 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1681 em->node_flags = rec->node_flags;
1683 for (i=0;i<nodemap->num;i++) {
1684 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1685 em->num_connected++;
1689 /* we shouldnt try to win this election if we cant be a recmaster */
1690 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1691 em->num_connected = 0;
1692 em->priority_time = timeval_current();
1695 talloc_free(nodemap);
1699 see if the given election data wins
/* Decide whether WE beat the election data in *em.  Disqualifiers first
 * (no recmaster capability, banned, stopped), then automatic wins if the
 * other node is banned/stopped, then the three-way tie-break: most
 * connected nodes, oldest priority_time, lowest pnn.
 * NOTE(review): the return statements fall on lines elided from this
 * listing; only the comparison structure is visible here. */
1701 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1703 struct election_message myem;
1706 ctdb_election_data(rec, &myem);
1708 /* we cant win if we dont have the recmaster capability */
1709 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1713 /* we cant win if we are banned */
1714 if (rec->node_flags & NODE_FLAGS_BANNED) {
1718 /* we cant win if we are stopped */
1719 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1723 /* we will automatically win if the other node is banned */
1724 if (em->node_flags & NODE_FLAGS_BANNED) {
1728 /* we will automatically win if the other node is banned */
1729 if (em->node_flags & NODE_FLAGS_STOPPED) {
1733 /* try to use the most connected node */
1735 cmp = (int)myem.num_connected - (int)em->num_connected;
1738 /* then the longest running node */
1740 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-break on pnn */
1744 cmp = (int)myem.pnn - (int)em->pnn;
1751 send out an election request
/* Broadcast our election data to all nodes on CTDB_SRVID_RECOVERY.
 * When update_recmaster is true we optimistically record ourselves as
 * recmaster on the local node (pnn) before the votes come in. */
1753 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1756 TDB_DATA election_data;
1757 struct election_message emsg;
1759 struct ctdb_context *ctdb = rec->ctdb;
1761 srvid = CTDB_SRVID_RECOVERY;
1763 ctdb_election_data(rec, &emsg);
/* emsg lives on the stack; the message is copied at send time */
1765 election_data.dsize = sizeof(struct election_message);
1766 election_data.dptr = (unsigned char *)&emsg;
1769 /* send an election message to all active nodes */
1770 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1771 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1774 /* A new node that is already frozen has entered the cluster.
1775 The existing nodes are not frozen and dont need to be frozen
1776 until the election has ended and we start the actual recovery
1778 if (update_recmaster == true) {
1779 /* first we assume we will win the election and set
1780 recoverymaster to be ourself on the current node
1782 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1784 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1794 this function will unban all nodes in the cluster
/* Clear the BANNED flag on every node that is both connected and
 * currently banned, using a MODIFY_FLAGS control per node. */
1796 static void unban_all_nodes(struct ctdb_context *ctdb)
1799 struct ctdb_node_map *nodemap;
1800 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1802 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1804 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1808 for (i=0;i<nodemap->num;i++) {
1809 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1810 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
/* third arg 0 = flags to set, NODE_FLAGS_BANNED = flags to clear */
1811 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1815 talloc_free(tmp_ctx);
1820 we think we are winning the election - send a broadcast election request
/* Timed-event callback (scheduled from election_handler): re-broadcast
 * our election request without touching the local recmaster setting
 * (update_recmaster=false), then drop the one-shot timer. */
1822 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1824 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1827 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1829 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1832 talloc_free(rec->send_election_te);
1833 rec->send_election_te = NULL;
1837 handler for memory dumps
/* Message handler: another node asked the recmaster for a talloc memory
 * dump.  Validate the reply-address payload, collect the dump and send
 * it back to the requester's pnn/srvid. */
1839 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1840 TDB_DATA data, void *private_data)
1842 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1845 struct rd_memdump_reply *rd;
/* payload must be exactly a reply address (pnn + srvid) */
1847 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1848 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1849 talloc_free(tmp_ctx);
1852 rd = (struct rd_memdump_reply *)data.dptr;
1854 dump = talloc_zero(tmp_ctx, TDB_DATA);
1856 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1857 talloc_free(tmp_ctx);
1860 ret = ctdb_dump_memory(ctdb, dump);
1862 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1863 talloc_free(tmp_ctx);
1867 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1869 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1871 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1872 talloc_free(tmp_ctx);
/* tmp_ctx owns the dump buffer, so this frees everything */
1876 talloc_free(tmp_ctx);
1880 handler for reload_nodes
/* Message handler: re-read the nodes file on this node on request. */
1882 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1883 TDB_DATA data, void *private_data)
1885 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1887 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1889 reload_nodes_file(rec->ctdb);
/* Timer callback: the disable-ip-check window has expired.  Freeing
 * ip_check_disable_ctx (which owns this timer) re-enables the periodic
 * public-IP verification. */
1893 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1894 struct timeval yt, void *p)
1896 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1898 talloc_free(rec->ip_check_disable_ctx);
1899 rec->ip_check_disable_ctx = NULL;
/* Message handler: a node reports a public-IP assignment change.  Only
 * the current recmaster records it (in the IP assignment tree); other
 * nodes ignore the message. */
1903 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
1904 TDB_DATA data, void *private_data)
1906 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1907 struct ctdb_public_ip *ip;
1909 if (rec->recmaster != rec->ctdb->pnn) {
1910 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
1914 if (data.dsize != sizeof(struct ctdb_public_ip)) {
1915 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
1919 ip = (struct ctdb_public_ip *)data.dptr;
1921 update_ip_assignment_tree(rec->ctdb, ip);
/* Message handler: suspend the local public-IP verification for the
 * number of seconds carried in the (uint32_t) payload.  The timer is
 * parented on ip_check_disable_ctx so that freeing the context cancels
 * it; reenable_ip_check fires when the window expires. */
1925 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1926 TDB_DATA data, void *private_data)
1928 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* drop any previous disable window before starting a new one */
1931 if (rec->ip_check_disable_ctx != NULL) {
1932 talloc_free(rec->ip_check_disable_ctx);
1933 rec->ip_check_disable_ctx = NULL;
1936 if (data.dsize != sizeof(uint32_t)) {
1937 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1938 "expexting %lu\n", (long unsigned)data.dsize,
1939 (long unsigned)sizeof(uint32_t)));
1942 if (data.dptr == NULL) {
1943 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1947 timeout = *((uint32_t *)data.dptr);
1948 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1950 rec->ip_check_disable_ctx = talloc_new(rec);
1951 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
1953 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1958 handler for ip reallocate, just add it to the list of callers and
1959 handle this later in the monitor_cluster loop so we do not recurse
1960 with other callers to takeover_run()
/* Queue the caller's reply address (an rd_memdump_reply reused as a
 * generic pnn/srvid pair) on rec->reallocate_callers; the list is
 * drained by process_ipreallocate_requests(). */
1962 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1963 TDB_DATA data, void *private_data)
1965 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1966 struct ip_reallocate_list *caller;
1968 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1969 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
/* lazily create the context that owns all queued callers */
1973 if (rec->ip_reallocate_ctx == NULL) {
1974 rec->ip_reallocate_ctx = talloc_new(rec);
1975 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
1978 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1979 CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* steal the message payload so it survives this handler; push onto
 * the singly-linked callers list (LIFO order) */
1981 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1982 caller->next = rec->reallocate_callers;
1983 rec->reallocate_callers = caller;
/* Drain the queued "ctdb ipreallocate" requests: refresh the remote
 * public-IP lists, run a takeover run, then send each queued caller the
 * int32 result code.  On any failure need_takeover_run is set so the
 * monitor loop retries. */
1988 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1990 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1993 struct ip_reallocate_list *callers;
1996 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1998 /* update the list of public ips that a node can handle for
2001 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2003 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2005 rec->need_takeover_run = true;
2008 ret = ctdb_takeover_run(ctdb, rec->nodemap);
/* NOTE(review): this message looks like a copy-paste of the reload
 * error above - a takeover_run failure is reported as "Failed to read
 * public ips".  Worth confirming against upstream before changing the
 * string. */
2010 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2012 rec->need_takeover_run = true;
/* reply payload is the (possibly non-zero) result of the work above */
2016 result.dsize = sizeof(int32_t);
2017 result.dptr = (uint8_t *)&ret;
2019 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2021 /* Someone that sent srvid==0 does not want a reply */
2022 if (callers->rd->srvid == 0) {
2025 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2026 "%u:%llu\n", (unsigned)callers->rd->pnn,
2027 (unsigned long long)callers->rd->srvid));
2028 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2030 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2031 "message to %u:%llu\n",
2032 (unsigned)callers->rd->pnn,
2033 (unsigned long long)callers->rd->srvid));
/* freeing ip_reallocate_ctx releases every queued caller at once */
2037 talloc_free(tmp_ctx);
2038 talloc_free(rec->ip_reallocate_ctx);
2039 rec->ip_reallocate_ctx = NULL;
2040 rec->reallocate_callers = NULL;
2046 handler for recovery master elections
/* Message handler for incoming election packets.  Restarts the election
 * timeout, then either contests the election (if our data wins, after a
 * 500ms damping delay) or concedes: release the recovery lock if held
 * and record the sender as recmaster on the local node. */
2048 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2049 TDB_DATA data, void *private_data)
2051 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2053 struct election_message *em = (struct election_message *)data.dptr;
2054 TALLOC_CTX *mem_ctx;
2056 /* we got an election packet - update the timeout for the election */
2057 talloc_free(rec->election_timeout);
2058 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2059 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2060 ctdb_election_timeout, rec);
2062 mem_ctx = talloc_new(ctdb);
2064 /* someone called an election. check their election data
2065 and if we disagree and we would rather be the elected node,
2066 send a new election message to all other nodes
2068 if (ctdb_election_win(rec, em)) {
/* only one pending counter-request at a time; the 500ms delay
 * coalesces bursts of election traffic */
2069 if (!rec->send_election_te) {
2070 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2071 timeval_current_ofs(0, 500000),
2072 election_send_request, rec);
2074 talloc_free(mem_ctx);
2075 /*unban_all_nodes(ctdb);*/
/* we lost: cancel any pending counter-request */
2080 talloc_free(rec->send_election_te);
2081 rec->send_election_te = NULL;
2083 if (ctdb->tunable.verify_recovery_lock != 0) {
2084 /* release the recmaster lock */
2085 if (em->pnn != ctdb->pnn &&
2086 ctdb->recovery_lock_fd != -1) {
2087 close(ctdb->recovery_lock_fd);
2088 ctdb->recovery_lock_fd = -1;
2089 unban_all_nodes(ctdb);
2093 /* ok, let that guy become recmaster then */
2094 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2096 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2097 talloc_free(mem_ctx);
2101 talloc_free(mem_ctx);
2107 force the start of the election process
/* Kick off a recmaster election: freeze the cluster (recovery mode
 * ACTIVE stops internode traffic), (re)arm the election timeout,
 * broadcast our election request with update_recmaster=true, then block
 * in ctdb_wait_election while responses arrive. */
2109 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2110 struct ctdb_node_map *nodemap)
2113 struct ctdb_context *ctdb = rec->ctdb;
2115 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2117 /* set all nodes to recovery mode to stop all internode traffic */
2118 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2120 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2124 talloc_free(rec->election_timeout);
2125 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2126 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2127 ctdb_election_timeout, rec);
2129 ret = send_election_request(rec, pnn, true);
2131 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2135 /* wait for a few seconds to collect all responses */
2136 ctdb_wait_election(rec);
2142 handler for when a node changes its flags
/* Message handler for node flag changes.  Refreshes the local nodemap,
 * locates the reported node, records its new flags, and - when we are
 * the recmaster and the cluster is in NORMAL mode - schedules a takeover
 * run if the DISABLED bit changed (other flag changes are covered by
 * the normal recovery path). */
2144 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2145 TDB_DATA data, void *private_data)
2148 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2149 struct ctdb_node_map *nodemap=NULL;
2150 TALLOC_CTX *tmp_ctx;
2152 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2153 int disabled_flag_changed;
2155 if (data.dsize != sizeof(*c)) {
2156 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2160 tmp_ctx = talloc_new(ctdb);
2161 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2163 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2165 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2166 talloc_free(tmp_ctx);
/* map the sender's pnn to an index in our nodemap */
2171 for (i=0;i<nodemap->num;i++) {
2172 if (nodemap->nodes[i].pnn == c->pnn) break;
2175 if (i == nodemap->num) {
2176 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2177 talloc_free(tmp_ctx);
2181 if (nodemap->nodes[i].flags != c->new_flags) {
2182 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* remember whether the DISABLED bit flipped before overwriting */
2185 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2187 nodemap->nodes[i].flags = c->new_flags;
2189 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2190 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2193 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2194 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2198 ctdb->recovery_master == ctdb->pnn &&
2199 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2200 /* Only do the takeover run if the perm disabled or unhealthy
2201 flags changed since these will cause an ip failover but not
2203 If the node became disconnected or banned this will also
2204 lead to an ip address failover but that is handled
2207 if (disabled_flag_changed) {
2208 rec->need_takeover_run = true;
2212 talloc_free(tmp_ctx);
2216 handler for when we need to push out flag changes ot all other nodes
/* Message handler: fetch the authoritative nodemap from the recmaster,
 * validate the reported pnn exists in it, then broadcast a MODIFY_FLAGS
 * control to every connected node so all nodes agree on the flags. */
2218 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2219 TDB_DATA data, void *private_data)
2222 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2223 struct ctdb_node_map *nodemap=NULL;
2224 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2228 /* find the recovery master */
2229 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2231 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2232 talloc_free(tmp_ctx);
2236 /* read the node flags from the recmaster */
2237 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2239 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2240 talloc_free(tmp_ctx);
2243 if (c->pnn >= nodemap->num) {
2244 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2245 talloc_free(tmp_ctx);
2249 /* send the flags update to all connected nodes */
2250 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2252 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2253 nodes, 0, CONTROL_TIMEOUT(),
2257 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2259 talloc_free(tmp_ctx);
2263 talloc_free(tmp_ctx);
/* Shared state for the async recmode verification: overall status plus
 * (on a line elided from this listing) a count of outstanding replies
 * that verify_recmode waits on. */
2267 struct verify_recmode_normal_data {
2269 enum monitor_result status;
/* Completion callback for one async GET_RECMODE control.  Downgrades
 * the aggregate status to MONITOR_FAILED on transport failure, or to
 * MONITOR_RECOVERY_NEEDED if the node reports it is not in NORMAL
 * recovery mode.  (The outstanding-reply decrement is on an elided
 * line.) */
2272 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2274 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2277 /* one more node has responded with recmode data*/
2280 /* if we failed to get the recmode, then return an error and let
2281 the main loop try again.
2283 if (state->state != CTDB_CONTROL_DONE) {
2284 if (rmdata->status == MONITOR_OK) {
2285 rmdata->status = MONITOR_FAILED;
2290 /* if we got a response, then the recmode will be stored in the
/* state->status carries the remote node's recovery mode */
2293 if (state->status != CTDB_RECOVERY_NORMAL) {
2294 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2295 rmdata->status = MONITOR_RECOVERY_NEEDED;
2302 /* verify that all nodes are in normal recovery mode */
/* Fan out async GET_RECMODE controls to every active node, pump the
 * event loop until all replies arrive (rmdata->count reaches 0), and
 * return the aggregated monitor_result.  Inactive nodes are skipped. */
2303 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2305 struct verify_recmode_normal_data *rmdata;
2306 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2307 struct ctdb_client_control_state *state;
2308 enum monitor_result status;
2311 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2312 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2314 rmdata->status = MONITOR_OK;
2316 /* loop over all active nodes and send an async getrecmode call to
2318 for (j=0; j<nodemap->num; j++) {
2319 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2322 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2324 nodemap->nodes[j].pnn);
2325 if (state == NULL) {
2326 /* we failed to send the control, treat this as
2327 an error and try again next iteration
2329 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2330 talloc_free(mem_ctx);
2331 return MONITOR_FAILED;
2334 /* set up the callback functions */
2335 state->async.fn = verify_recmode_normal_callback;
2336 state->async.private_data = rmdata;
2338 /* one more control to wait for to complete */
2343 /* now wait for up to the maximum number of seconds allowed
2344 or until all nodes we expect a response from has replied
2346 while (rmdata->count > 0) {
2347 event_loop_once(ctdb->ev);
/* copy status out before freeing the context that owns rmdata */
2350 status = rmdata->status;
2351 talloc_free(mem_ctx);
/* Shared state for the async recmaster verification: the recoverd
 * context (for culprit tracking), aggregate status, and - on elided
 * lines - the expected recmaster pnn and outstanding-reply count. */
2356 struct verify_recmaster_data {
2357 struct ctdb_recoverd *rec;
2360 enum monitor_result status;
/* Completion callback for one async GET_RECMASTER control.  Flags
 * MONITOR_FAILED on transport failure; if a node names a different
 * recmaster than us (rmdata->pnn), marks that node as culprit and
 * requests a new election via MONITOR_ELECTION_NEEDED. */
2363 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2365 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2368 /* one more node has responded with recmaster data*/
2371 /* if we failed to get the recmaster, then return an error and let
2372 the main loop try again.
2374 if (state->state != CTDB_CONTROL_DONE) {
2375 if (rmdata->status == MONITOR_OK) {
2376 rmdata->status = MONITOR_FAILED;
2381 /* if we got a response, then the recmaster will be stored in the
/* state->status carries the pnn that node believes is recmaster */
2384 if (state->status != rmdata->pnn) {
2385 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2386 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2387 rmdata->status = MONITOR_ELECTION_NEEDED;
2394 /* verify that all nodes agree that we are the recmaster */
/* Fan out async GET_RECMASTER controls to every active node and pump
 * the event loop until all replies arrive, returning the aggregated
 * monitor_result.  Mirrors verify_recmode but checks recmaster
 * agreement against pnn instead of recovery mode. */
2395 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2397 struct ctdb_context *ctdb = rec->ctdb;
2398 struct verify_recmaster_data *rmdata;
2399 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2400 struct ctdb_client_control_state *state;
2401 enum monitor_result status;
2404 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2405 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2409 rmdata->status = MONITOR_OK;
2411 /* loop over all active nodes and send an async getrecmaster call to
2413 for (j=0; j<nodemap->num; j++) {
2414 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2417 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2419 nodemap->nodes[j].pnn);
2420 if (state == NULL) {
2421 /* we failed to send the control, treat this as
2422 an error and try again next iteration
2424 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2425 talloc_free(mem_ctx);
2426 return MONITOR_FAILED;
2429 /* set up the callback functions */
2430 state->async.fn = verify_recmaster_callback;
2431 state->async.private_data = rmdata;
2433 /* one more control to wait for to complete */
2438 /* now wait for up to the maximum number of seconds allowed
2439 or until all nodes we expect a response from has replied
2441 while (rmdata->count > 0) {
2442 event_loop_once(ctdb->ev);
/* copy status out before freeing the context that owns rmdata */
2445 status = rmdata->status;
2446 talloc_free(mem_ctx);
2451 /* called to check that the local allocation of public ip addresses is ok.
/* Periodic local check that this node holds exactly the public IPs the
 * cluster thinks it should.  Uptime is sampled before and after reading
 * the IP list so the check can be skipped if a recovery started or
 * finished in between (the list would be stale).  On any mismatch, or
 * when the interface list changed, a TAKEOVER_RUN request is sent to
 * the recmaster rather than fixing things locally. */
2453 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn)
2455 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2456 struct ctdb_control_get_ifaces *ifaces = NULL;
2457 struct ctdb_all_public_ips *ips = NULL;
2458 struct ctdb_uptime *uptime1 = NULL;
2459 struct ctdb_uptime *uptime2 = NULL;
2461 bool need_iface_check = false;
2462 bool need_takeover_run = false;
/* first uptime sample (before reading the IP list) */
2464 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2465 CTDB_CURRENT_NODE, &uptime1);
2467 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2468 talloc_free(mem_ctx);
2473 /* read the interfaces from the local node */
2474 ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
2476 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
2477 talloc_free(mem_ctx);
/* compare against the cached interface list: count change or raw
 * content change (memcmp over the talloc'd blob) both trigger a run */
2482 need_iface_check = true;
2483 } else if (rec->ifaces->num != ifaces->num) {
2484 need_iface_check = true;
2485 } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
2486 need_iface_check = true;
2489 if (need_iface_check) {
2490 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2491 "local node %u - force takeover run\n",
2493 need_takeover_run = true;
2496 /* read the ip allocation from the local node */
2497 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2499 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2500 talloc_free(mem_ctx);
/* second uptime sample (after reading the IP list) */
2504 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2505 CTDB_CURRENT_NODE, &uptime2);
2507 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2508 talloc_free(mem_ctx);
2512 /* skip the check if the startrecovery time has changed */
2513 if (timeval_compare(&uptime1->last_recovery_started,
2514 &uptime2->last_recovery_started) != 0) {
2515 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2516 talloc_free(mem_ctx);
2520 /* skip the check if the endrecovery time has changed */
2521 if (timeval_compare(&uptime1->last_recovery_finished,
2522 &uptime2->last_recovery_finished) != 0) {
2523 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2524 talloc_free(mem_ctx);
2528 /* skip the check if we have started but not finished recovery */
2529 if (timeval_compare(&uptime1->last_recovery_finished,
2530 &uptime1->last_recovery_started) != 1) {
2531 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2532 talloc_free(mem_ctx);
/* the check is going ahead: cache the fresh interface list */
2537 talloc_free(rec->ifaces);
2538 rec->ifaces = talloc_steal(rec, ifaces);
2540 /* verify that we have the ip addresses we should have
2541 and we dont have ones we shouldnt have.
2542 if we find an inconsistency we set recmode to
2543 active on the local node and wait for the recmaster
2544 to do a full blown recovery
2546 for (j=0; j<ips->num; j++) {
2547 if (ips->ips[j].pnn == pnn) {
/* assigned to us: we must actually hold this address */
2548 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2549 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2550 ctdb_addr_to_str(&ips->ips[j].addr)));
2551 need_takeover_run = true;
/* assigned elsewhere: we must NOT still hold it */
2554 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2555 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2556 ctdb_addr_to_str(&ips->ips[j].addr)));
2557 need_takeover_run = true;
/* ask the recmaster (via TAKEOVER_RUN message) to rebalance;
 * rd is the reply address, populated on elided lines */
2562 if (need_takeover_run) {
2563 struct takeover_run_reply rd;
2566 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
2570 data.dptr = (uint8_t *)&rd;
2571 data.dsize = sizeof(rd);
2573 ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2575 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2578 talloc_free(mem_ctx);
/* Async-control callback: store the nodemap returned by one remote
 * node into the remote_nodemaps[] array (indexed by pnn), after
 * bounds-checking the reported pnn. The payload is talloc_steal'ed
 * so it lives as long as the array itself. */
2583 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2585 struct ctdb_node_map **remote_nodemaps = callback_data;
/* guard against an out-of-range pnn before indexing the array */
2587 if (node_pnn >= ctdb->num_nodes) {
2588 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2592 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Fetch the nodemap from every active node in parallel via the
 * CTDB_CONTROL_GET_NODEMAP control; results are collected into
 * remote_nodemaps[] by async_getnodemap_callback. Returns nonzero
 * (from the error path below) if any node fails to respond. */
2596 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2597 struct ctdb_node_map *nodemap,
2598 struct ctdb_node_map **remote_nodemaps)
2602 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2603 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2605 CONTROL_TIMEOUT(), false, tdb_null,
2606 async_getnodemap_callback,
2608 remote_nodemaps) != 0) {
2609 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Result states for the reclock-check child process:
 * CHECKING = still waiting, OK = lock file readable,
 * FAILED = read error, TIMEOUT = child hung (slow cluster FS). */
2617 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/* State for one recovery-lock verification: the forked child, the
 * pipe back to the parent, a timeout event and an fd event, plus the
 * start time used to report lock latency. */
2618 struct ctdb_check_reclock_state {
2619 struct ctdb_context *ctdb;
2620 struct timeval start_time;
2623 struct timed_event *te;
2624 struct fd_event *fde;
2625 enum reclock_child_status status;
2628 /* when we free the reclock state we must kill any child process.
 * Talloc destructor: report how long the lock check took, close both
 * ends of the pipe (if still open) and SIGKILL the child so it never
 * outlives the state object. */
2630 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2632 struct ctdb_context *ctdb = state->ctdb;
2634 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2636 if (state->fd[0] != -1) {
2637 close(state->fd[0]);
2640 if (state->fd[1] != -1) {
2641 close(state->fd[1]);
2644 kill(state->child, SIGKILL);
/*
2649 called if our check_reclock child times out. this would happen if
2650 i/o to the reclock file blocks.
 Marks the check as RECLOCK_TIMEOUT so the wait loop in
 check_recovery_lock() terminates.
*/
2652 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2653 struct timeval t, void *private_data)
2655 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2656 struct ctdb_check_reclock_state);
2658 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2659 state->status = RECLOCK_TIMEOUT;
2662 /* this is called when the child process has completed checking the reclock
2663 file and has written data back to us through the pipe.
 Reads the single status byte; anything other than exactly one byte of
 RECLOCK_OK is treated as failure. Also frees the timeout event since
 a response arrived in time. */
2665 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2666 uint16_t flags, void *private_data)
2668 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2669 struct ctdb_check_reclock_state);
2673 /* we got a response from our child process so we can abort the
 timeout event */
2676 talloc_free(state->te);
2679 ret = read(state->fd[0], &c, 1);
2680 if (ret != 1 || c != RECLOCK_OK) {
2681 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2682 state->status = RECLOCK_FAILED;
2687 state->status = RECLOCK_OK;
/* Verify that the recovery lock we hold is not stale, without risking
 * the recovery daemon blocking on cluster-filesystem i/o: fork a
 * child that pread()s the lock file and reports one status byte back
 * through a pipe. The parent waits in an event loop guarded by a
 * 15-second timeout; on failure the lock fd is closed so a new lock
 * must be taken.
 * NOTE(review): some lines (returns, closing braces) are missing from
 * this extraction — check the full source for the exact error paths. */
2691 static int check_recovery_lock(struct ctdb_context *ctdb)
2694 struct ctdb_check_reclock_state *state;
2695 pid_t parent = getpid();
2697 if (ctdb->recovery_lock_fd == -1) {
2698 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2702 state = talloc(ctdb, struct ctdb_check_reclock_state);
2703 CTDB_NO_MEMORY(ctdb, state);
2706 state->start_time = timeval_current();
2707 state->status = RECLOCK_CHECKING;
2711 ret = pipe(state->fd);
2714 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2718 state->child = fork();
2719 if (state->child == (pid_t)-1) {
2720 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2721 close(state->fd[0]);
2723 close(state->fd[1]);
/* child: read one byte of the lock file and write the result byte
 * back to the parent through the pipe */
2729 if (state->child == 0) {
2730 char cc = RECLOCK_OK;
2731 close(state->fd[0]);
2734 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2735 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2736 cc = RECLOCK_FAILED;
2739 write(state->fd[1], &cc, 1);
2740 /* make sure we die when our parent dies */
2741 while (kill(parent, 0) == 0 || errno != ESRCH) {
2743 write(state->fd[1], &cc, 1);
/* parent: keep only the read end of the pipe */
2747 close(state->fd[1]);
2749 set_close_on_exec(state->fd[0]);
2751 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
2753 talloc_set_destructor(state, check_reclock_destructor);
/* hard 15s deadline in case the child blocks on the cluster FS */
2755 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2756 ctdb_check_reclock_timeout, state);
2757 if (state->te == NULL) {
2758 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2763 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2764 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2765 reclock_child_handler,
2768 if (state->fde == NULL) {
2769 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* pump the event loop until the child replies or the timer fires */
2774 while (state->status == RECLOCK_CHECKING) {
2775 event_loop_once(ctdb->ev);
/* drop the (possibly stale) lock fd so a fresh lock is acquired */
2778 if (state->status == RECLOCK_FAILED) {
2779 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2780 close(ctdb->recovery_lock_fd);
2781 ctdb->recovery_lock_fd = -1;
/* Synchronise the recovery daemon's idea of the reclock file with the
 * main daemon's current setting. Handles three transitions:
 *  - reclock disabled: free the cached name, close the fd, and turn
 *    off verify_recovery_lock;
 *  - reclock set for the first time: cache the name and reset the fd;
 *  - reclock name changed: replace the cached name, disable
 *    verification until the new lock is taken, and reset the fd.
 * If the name is unchanged, nothing is done. */
2790 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2792 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2793 const char *reclockfile;
2795 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2796 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2797 talloc_free(tmp_ctx);
/* daemon reports no reclock file: tear down any previous state */
2801 if (reclockfile == NULL) {
2802 if (ctdb->recovery_lock_file != NULL) {
2803 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2804 talloc_free(ctdb->recovery_lock_file);
2805 ctdb->recovery_lock_file = NULL;
2806 if (ctdb->recovery_lock_fd != -1) {
2807 close(ctdb->recovery_lock_fd);
2808 ctdb->recovery_lock_fd = -1;
2811 ctdb->tunable.verify_recovery_lock = 0;
2812 talloc_free(tmp_ctx);
/* first time we learn the reclock file name */
2816 if (ctdb->recovery_lock_file == NULL) {
2817 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2818 if (ctdb->recovery_lock_fd != -1) {
2819 close(ctdb->recovery_lock_fd);
2820 ctdb->recovery_lock_fd = -1;
2822 talloc_free(tmp_ctx);
/* unchanged name: nothing to do */
2827 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2828 talloc_free(tmp_ctx);
/* the name changed: adopt the new one and reset lock state */
2832 talloc_free(ctdb->recovery_lock_file);
2833 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2834 ctdb->tunable.verify_recovery_lock = 0;
2835 if (ctdb->recovery_lock_fd != -1) {
2836 close(ctdb->recovery_lock_fd);
2837 ctdb->recovery_lock_fd = -1;
2840 talloc_free(tmp_ctx);
/* One iteration of the recovery daemon's monitoring loop. Roughly in
 * order: verify ctdbd is alive, refresh tunables and the reclock
 * setting, fetch pnn/vnnmap/nodemap, determine the recmaster and
 * force elections when needed, verify local IP allocation, and — if
 * we are the recmaster — cross-check every remote node's nodemap,
 * flags and vnnmap against ours, triggering do_recovery() on any
 * inconsistency, and finally run a takeover run if one is pending.
 * NOTE(review): this extraction is missing many lines (returns,
 * closing braces, some guards) — consult the full source before
 * changing anything here. */
2844 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2845 TALLOC_CTX *mem_ctx)
2848 struct ctdb_node_map *nodemap=NULL;
2849 struct ctdb_node_map *recmaster_nodemap=NULL;
2850 struct ctdb_node_map **remote_nodemaps=NULL;
2851 struct ctdb_vnn_map *vnnmap=NULL;
2852 struct ctdb_vnn_map *remote_vnnmap=NULL;
2853 int32_t debug_level;
2858 /* verify that the main daemon is still running */
2859 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2860 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2864 /* ping the local daemon to tell it we are alive */
2865 ctdb_ctrl_recd_ping(ctdb);
2867 if (rec->election_timeout) {
2868 /* an election is in progress */
2872 /* read the debug level from the parent and update locally */
2873 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2875 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2878 LogLevel = debug_level;
2881 /* We must check if we need to ban a node here but we want to do this
2882 as early as possible so we dont wait until we have pulled the node
2883 map from the local node. thats why we have the hardcoded value 20
*/
2885 for (i=0; i<ctdb->num_nodes; i++) {
2886 struct ctdb_banning_state *ban_state;
2888 if (ctdb->nodes[i]->ban_state == NULL) {
2891 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
2892 if (ban_state->count < 20) {
2895 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2896 ctdb->nodes[i]->pnn, ban_state->count,
2897 ctdb->tunable.recovery_ban_period));
2898 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2899 ban_state->count = 0;
2902 /* get relevant tunables */
2903 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2905 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2909 /* get the current recovery lock file from the server */
2910 if (update_recovery_lock_file(ctdb) != 0) {
2911 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2915 /* Make sure that if recovery lock verification becomes disabled when
 we close the file */
2918 if (ctdb->tunable.verify_recovery_lock == 0) {
2919 if (ctdb->recovery_lock_fd != -1) {
2920 close(ctdb->recovery_lock_fd);
2921 ctdb->recovery_lock_fd = -1;
2925 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2926 if (pnn == (uint32_t)-1) {
2927 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2931 /* get the vnnmap */
2932 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2934 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2939 /* get number of nodes */
2941 talloc_free(rec->nodemap);
2942 rec->nodemap = NULL;
2945 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2947 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2950 nodemap = rec->nodemap;
2952 /* remember our own node flags */
2953 rec->node_flags = nodemap->nodes[pnn].flags;
2955 /* update the capabilities for all nodes */
2956 ret = update_capabilities(ctdb, nodemap);
2958 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2962 /* check which node is the recovery master */
2963 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2965 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2969 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2970 if (rec->recmaster != pnn) {
2971 if (rec->ip_reallocate_ctx != NULL) {
2972 talloc_free(rec->ip_reallocate_ctx);
2973 rec->ip_reallocate_ctx = NULL;
2974 rec->reallocate_callers = NULL;
/* no recmaster known yet: hold an election */
2978 if (rec->recmaster == (uint32_t)-1) {
2979 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2980 force_election(rec, pnn, nodemap);
2984 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2985 also frozen and thet the recmode is set to active.
*/
2987 if (nodemap->nodes[pnn].flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2988 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2990 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2992 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2993 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2995 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2997 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3000 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3002 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3009 /* If the local node is stopped, verify we are not the recmaster
3010 and yield this role if so
*/
3012 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) && (rec->recmaster == pnn)) {
3013 DEBUG(DEBUG_ERR,("Local node is INACTIVE. Yielding recmaster role\n"));
3014 force_election(rec, pnn, nodemap);
/*
3019 * if the current recmaster do not have CTDB_CAP_RECMASTER,
3020 * but we have force an election and try to become the new
 * recmaster
*/
3023 if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3024 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3025 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3026 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3027 " but we (node %u) have - force an election\n",
3028 rec->recmaster, pnn));
3029 force_election(rec, pnn, nodemap);
3033 /* count how many active nodes there are */
3034 rec->num_active = 0;
3035 rec->num_connected = 0;
3036 for (i=0; i<nodemap->num; i++) {
3037 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
 /* counted as active */
3040 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3041 rec->num_connected++;
3046 /* verify that the recmaster node is still active */
3047 for (j=0; j<nodemap->num; j++) {
3048 if (nodemap->nodes[j].pnn==rec->recmaster) {
 /* found the recmaster's slot; j indexes it below */
3053 if (j == nodemap->num) {
3054 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3055 force_election(rec, pnn, nodemap);
3059 /* if recovery master is disconnected we must elect a new recmaster */
3060 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3061 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3062 force_election(rec, pnn, nodemap);
3066 /* grap the nodemap from the recovery master to check if it is banned */
3067 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3068 mem_ctx, &recmaster_nodemap);
3070 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3071 nodemap->nodes[j].pnn));
3076 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3077 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3078 force_election(rec, pnn, nodemap);
3083 /* verify that we have all ip addresses we should have and we dont
3084 * have addresses we shouldnt have.
*/
3086 if (ctdb->do_checkpublicip) {
3087 if (rec->ip_check_disable_ctx == NULL) {
3088 if (verify_local_ip_allocation(ctdb, rec, pnn) != 0) {
3089 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3095 /* if we are not the recmaster then we do not need to check
3096 if recovery is needed
*/
3098 if (pnn != rec->recmaster) {
 /* everything below is recmaster-only */
3103 /* ensure our local copies of flags are right */
3104 ret = update_local_flags(rec, nodemap);
3105 if (ret == MONITOR_ELECTION_NEEDED) {
3106 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3107 force_election(rec, pnn, nodemap);
3110 if (ret != MONITOR_OK) {
3111 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3115 if (ctdb->num_nodes != nodemap->num) {
3116 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3117 reload_nodes_file(ctdb);
3121 /* verify that all active nodes agree that we are the recmaster */
3122 switch (verify_recmaster(rec, nodemap, pnn)) {
3123 case MONITOR_RECOVERY_NEEDED:
3124 /* can not happen */
3126 case MONITOR_ELECTION_NEEDED:
3127 force_election(rec, pnn, nodemap);
3131 case MONITOR_FAILED:
 /* fall out and retry next iteration */
3136 if (rec->need_recovery) {
3137 /* a previous recovery didn't finish */
3138 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3142 /* verify that all active nodes are in normal mode
3143 and not in recovery mode
*/
3145 switch (verify_recmode(ctdb, nodemap)) {
3146 case MONITOR_RECOVERY_NEEDED:
3147 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3149 case MONITOR_FAILED:
3151 case MONITOR_ELECTION_NEEDED:
3152 /* can not happen */
3158 if (ctdb->tunable.verify_recovery_lock != 0) {
3159 /* we should have the reclock - check its not stale */
3160 ret = check_recovery_lock(ctdb);
3162 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3163 ctdb_set_culprit(rec, ctdb->pnn);
3164 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3169 /* if there are takeovers requested, perform it and notify the waiters */
3170 if (rec->reallocate_callers) {
3171 process_ipreallocate_requests(ctdb, rec);
3174 /* get the nodemap for all active remote nodes
*/
3176 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3177 if (remote_nodemaps == NULL) {
3178 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3181 for(i=0; i<nodemap->num; i++) {
3182 remote_nodemaps[i] = NULL;
3184 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3185 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3189 /* verify that all other nodes have the same nodemap as we have
*/
3191 for (j=0; j<nodemap->num; j++) {
3192 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 /* skip inactive nodes */
3196 if (remote_nodemaps[j] == NULL) {
3197 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3198 ctdb_set_culprit(rec, j);
3203 /* if the nodes disagree on how many nodes there are
3204 then this is a good reason to try recovery
*/
3206 if (remote_nodemaps[j]->num != nodemap->num) {
3207 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3208 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3209 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3210 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3214 /* if the nodes disagree on which nodes exist and are
3215 active, then that is also a good reason to do recovery
*/
3217 for (i=0;i<nodemap->num;i++) {
3218 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3219 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3220 nodemap->nodes[j].pnn, i,
3221 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3222 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3223 do_recovery(rec, mem_ctx, pnn, nodemap,
 vnnmap);
3229 /* verify the flags are consistent
*/
3231 for (i=0; i<nodemap->num; i++) {
3232 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
 /* skip disconnected nodes */
3236 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3237 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3238 nodemap->nodes[j].pnn,
3239 nodemap->nodes[i].pnn,
3240 remote_nodemaps[j]->nodes[i].flags,
3241 nodemap->nodes[j].flags));
/* a node's own view of its flags is authoritative; push it out */
3243 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3244 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3245 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3246 do_recovery(rec, mem_ctx, pnn, nodemap,
 vnnmap);
/* otherwise the recmaster's view wins */
3250 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3251 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3252 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3253 do_recovery(rec, mem_ctx, pnn, nodemap,
 vnnmap);
3262 /* there better be the same number of lmasters in the vnn map
3263 as there are active nodes or we will have to do a recovery
*/
3265 if (vnnmap->size != rec->num_active) {
3266 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3267 vnnmap->size, rec->num_active));
3268 ctdb_set_culprit(rec, ctdb->pnn);
3269 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3273 /* verify that all active nodes in the nodemap also exist in
 the vnnmap
*/
3276 for (j=0; j<nodemap->num; j++) {
3277 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 /* skip inactive nodes */
3280 if (nodemap->nodes[j].pnn == pnn) {
 /* skip ourselves */
3284 for (i=0; i<vnnmap->size; i++) {
3285 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
 /* found */
3289 if (i == vnnmap->size) {
3290 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3291 nodemap->nodes[j].pnn));
3292 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3293 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3299 /* verify that all other nodes have the same vnnmap
3300 and are from the same generation
*/
3302 for (j=0; j<nodemap->num; j++) {
3303 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 /* skip inactive nodes */
3306 if (nodemap->nodes[j].pnn == pnn) {
 /* skip ourselves */
3310 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3311 mem_ctx, &remote_vnnmap);
3313 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3314 nodemap->nodes[j].pnn));
3318 /* verify the vnnmap generation is the same */
3319 if (vnnmap->generation != remote_vnnmap->generation) {
3320 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3321 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3322 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3323 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3327 /* verify the vnnmap size is the same */
3328 if (vnnmap->size != remote_vnnmap->size) {
3329 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3330 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3331 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3332 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3336 /* verify the vnnmap is the same */
3337 for (i=0;i<vnnmap->size;i++) {
3338 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3339 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3340 nodemap->nodes[j].pnn));
3341 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3342 do_recovery(rec, mem_ctx, pnn, nodemap,
 vnnmap);
3349 /* we might need to change who has what IP assigned */
3350 if (rec->need_takeover_run) {
3351 uint32_t culprit = (uint32_t)-1;
3353 rec->need_takeover_run = false;
3355 /* update the list of public ips that a node can handle for
 all connected nodes */
3358 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3360 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
 culprit));
3362 ctdb_set_culprit(rec, culprit);
3363 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3367 /* execute the "startrecovery" event script on all nodes */
3368 ret = run_startrecovery_eventscript(rec, nodemap);
3370 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3371 ctdb_set_culprit(rec, ctdb->pnn);
3372 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3376 ret = ctdb_takeover_run(ctdb, nodemap);
3378 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3379 ctdb_set_culprit(rec, ctdb->pnn);
3380 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3384 /* execute the "recovered" event script on all nodes */
3385 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3387 // we cant check whether the event completed successfully
3388 // since this script WILL fail if the node is in recovery mode
3389 // and if that race happens, the code here would just cause a second
3390 // cascading recovery.
3392 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3393 ctdb_set_culprit(rec, ctdb->pnn);
3394 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
/*
3401 the main monitoring loop
 Allocates the recoverd state, registers all SRVID message handlers
 (memory dump, elections, flag changes, vacuum fetch, reloadnodes,
 takeover runs, IP-check disable, IP updates), then runs main_loop()
 once per recover_interval forever, with a fresh talloc context per
 iteration.
*/
3403 static void monitor_cluster(struct ctdb_context *ctdb)
3405 struct ctdb_recoverd *rec;
3407 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3409 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3410 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3414 rec->priority_time = timeval_current();
3416 /* register a message port for sending memory dumps */
3417 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3419 /* register a message port for recovery elections */
3420 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3422 /* when nodes are disabled/enabled */
3423 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3425 /* when we are asked to puch out a flag change */
3426 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3428 /* register a message port for vacuum fetch */
3429 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3431 /* register a message port for reloadnodes */
3432 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3434 /* register a message port for performing a takeover run */
3435 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3437 /* register a message port for disabling the ip check for a short while */
3438 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3440 /* register a message port for updating the recovery daemons node assignment for an ip */
3441 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
/* per-iteration scratch context, freed after each main_loop() pass */
3444 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3446 DEBUG(DEBUG_CRIT,(__location__
3447 " Failed to create temp context\n"));
3451 main_loop(ctdb, rec, mem_ctx);
3452 talloc_free(mem_ctx);
3454 /* we only check for recovery once every second */
3455 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
/*
3460 event handler for when the main ctdbd dies
 Fires when the pipe to the parent becomes readable (parent closed it
 on exit); the recovery daemon cannot usefully continue without ctdbd.
*/
3462 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3463 uint16_t flags, void *private_data)
3465 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
/*
3470 called regularly to verify that the recovery daemon is still running
 Runs in the MAIN daemon. If kill(recoverd_pid, 0) fails, the
 recovery daemon is gone, so the main daemon performs an orderly
 shutdown (stop monitoring/keepalives, release IPs, run the
 "shutdown" event). Otherwise re-arms itself to run again in 30s.
*/
3472 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3473 struct timeval yt, void *p)
3475 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3477 if (kill(ctdb->recoverd_pid, 0) != 0) {
3478 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
3480 ctdb_stop_recoverd(ctdb);
3481 ctdb_stop_keepalive(ctdb);
3482 ctdb_stop_monitoring(ctdb);
3483 ctdb_release_all_ips(ctdb);
3484 if (ctdb->methods != NULL) {
3485 ctdb->methods->shutdown(ctdb);
3487 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
/* re-arm: check again in 30 seconds */
3492 event_add_timed(ctdb->ev, ctdb,
3493 timeval_current_ofs(30, 0),
3494 ctdb_check_recd, ctdb);
/* SIGCHLD handler for the recovery daemon: reap exited children
 * (e.g. the reclock-check child) non-blockingly so they do not
 * become zombies. */
3497 static void recd_sig_child_handler(struct event_context *ev,
3498 struct signal_event *se, int signum, int count,
 void *dont_care, void *private_data)
3502 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3507 pid = waitpid(-1, &status, WNOHANG);
/* ECHILD just means nothing left to reap; anything else is reported */
3509 if (errno != ECHILD) {
3510 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3515 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
/*
3521 startup the recovery daemon as a child of the main ctdb daemon
 Parent: records the child pid and schedules ctdb_check_recd to watch
 it. Child: switches to client mode, watches the pipe so it exits if
 the parent dies, installs a SIGCHLD handler, and enters
 monitor_cluster() (which never returns in normal operation).
 Returns 0 in the parent on success.
*/
3523 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3526 struct signal_event *se;
3528 if (pipe(fd) != 0) {
 /* cannot create the parent-death pipe */
3532 ctdb->ctdbd_pid = getpid();
3534 ctdb->recoverd_pid = fork();
3535 if (ctdb->recoverd_pid == -1) {
 /* fork failed */
3539 if (ctdb->recoverd_pid != 0) {
/* parent: start the periodic liveness check of the child */
3541 event_add_timed(ctdb->ev, ctdb,
3542 timeval_current_ofs(30, 0),
3543 ctdb_check_recd, ctdb);
/* child from here on */
3549 srandom(getpid() ^ time(NULL));
3551 if (switch_from_server_to_client(ctdb) != 0) {
3552 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3556 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* exit if the read end of the pipe signals parent death */
3558 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3559 ctdb_recoverd_parent, &fd[0]);
3561 /* set up a handler to pick up sigchld */
3562 se = event_add_signal(ctdb->ev, ctdb,
 SIGCHLD, 0,
3564 recd_sig_child_handler,
 ctdb);
3567 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3571 monitor_cluster(ctdb);
/* monitor_cluster() should never return */
3573 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
/*
3578 shutdown the recovery daemon
 No-op if no recovery daemon was ever started (pid == 0); otherwise
 asks it to terminate with SIGTERM.
*/
3580 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3582 if (ctdb->recoverd_pid == 0) {
 /* never started */
3586 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3587 kill(ctdb->recoverd_pid, SIGTERM);