4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
/* Pending "ctdb ipreallocate" callers; each gets a reply message once the
 * takeover run has finished.
 * NOTE(review): extraction dropped the closing comment delimiter and the
 * struct's closing "};" — restore from the original file. */
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
39 struct rd_memdump_reply *rd;
/* Per-node misbehaviour record used to decide when a node should be banned.
 * NOTE(review): ctdb_set_culprit_count() below also uses a "count" member;
 * that field's line was dropped in extraction. */
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;
/* Private state of the recovery daemon. Holds the cached node map, timers
 * for elections, the vacuum work queue, and the list of pending
 * "ipreallocate" callers. */
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
/* Timeouts derived from tunables; both macros expand in scopes where a
 * local "ctdb" pointer is visible. CONTROL_TIMEOUT bounds individual
 * control round-trips, MONITOR_TIMEOUT paces the monitoring loop. */
68 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
69 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
73 ban a node for a period of time
75 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
78 struct ctdb_context *ctdb = rec->ctdb;
79 struct ctdb_ban_time bantime;
81 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
83 if (!ctdb_validate_pnn(ctdb, pnn)) {
84 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
89 bantime.time = ban_time;
91 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
93 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Outcome of one monitoring pass over the cluster. */
99 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/* Run the "recovered" event script on all active nodes by broadcasting
 * CTDB_CONTROL_END_RECOVERY. Returns 0 on success, -1 on failure (the
 * second talloc_free is the success path's cleanup).
 * NOTE(review): several argument lines of ctdb_client_async_control, the
 * braces and the return statements were dropped in extraction. */
103 run the "recovered" eventscript on all nodes
105 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
110 tmp_ctx = talloc_new(ctdb);
111 CTDB_NO_MEMORY(ctdb, tmp_ctx);
113 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
114 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
116 CONTROL_TIMEOUT(), false, tdb_null,
119 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
121 talloc_free(tmp_ctx);
125 talloc_free(tmp_ctx);
130 remember the trouble maker
132 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
134 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
135 struct ctdb_banning_state *ban_state;
137 if (culprit > ctdb->num_nodes) {
138 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
142 if (ctdb->nodes[culprit]->ban_state == NULL) {
143 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
144 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
148 ban_state = ctdb->nodes[culprit]->ban_state;
149 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
150 /* this was the first time in a long while this node
151 misbehaved so we will forgive any old transgressions.
153 ban_state->count = 0;
156 ban_state->count += count;
157 ban_state->last_reported_time = timeval_current();
158 rec->last_culprit_node = culprit;
162 remember the trouble maker
164 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
166 ctdb_set_culprit_count(rec, culprit, 1);
170 /* this callback is called for every node that failed to execute the
173 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
175 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
177 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
179 ctdb_set_culprit(rec, node_pnn);
/* Run the "startrecovery" event script on all active nodes by broadcasting
 * CTDB_CONTROL_START_RECOVERY; nodes that fail are reported through
 * startrecovery_fail_callback (which marks them as culprits).
 * Returns 0 on success, -1 on failure.
 * NOTE(review): argument lines, braces and return statements were dropped
 * in extraction. */
183 run the "startrecovery" eventscript on all nodes
185 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
189 struct ctdb_context *ctdb = rec->ctdb;
191 tmp_ctx = talloc_new(ctdb);
192 CTDB_NO_MEMORY(ctdb, tmp_ctx);
194 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
195 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
197 CONTROL_TIMEOUT(), false, tdb_null,
199 startrecovery_fail_callback,
201 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
202 talloc_free(tmp_ctx);
206 talloc_free(tmp_ctx);
210 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
212 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
213 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
216 if (node_pnn < ctdb->num_nodes) {
217 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
/* Refresh the cached capability bits for every active node by broadcasting
 * CTDB_CONTROL_GET_CAPABILITIES; replies land in async_getcap_callback.
 * Returns 0 on success, -1 on failure.
 * NOTE(review): braces and return statements were dropped in extraction. */
222 update the node capabilities for all connected nodes
224 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
229 tmp_ctx = talloc_new(ctdb);
230 CTDB_NO_MEMORY(ctdb, tmp_ctx);
232 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
233 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
234 nodes, CONTROL_TIMEOUT(),
236 async_getcap_callback, NULL,
238 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
239 talloc_free(tmp_ctx);
243 talloc_free(tmp_ctx);
/* Set the recovery mode on all active nodes. When entering recovery
 * (CTDB_RECOVERY_ACTIVE) the nodes are frozen first via
 * CTDB_CONTROL_FREEZE, then CTDB_CONTROL_SET_RECMODE is broadcast with the
 * new mode as payload. Returns 0 on success, -1 on failure.
 * NOTE(review): argument lines, braces and returns were dropped in
 * extraction. */
248 change recovery mode on all nodes
250 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
256 tmp_ctx = talloc_new(ctdb);
257 CTDB_NO_MEMORY(ctdb, tmp_ctx);
259 /* freeze all nodes */
260 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
261 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
262 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
263 nodes, CONTROL_TIMEOUT(),
267 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
268 talloc_free(tmp_ctx);
274 data.dsize = sizeof(uint32_t);
275 data.dptr = (unsigned char *)&rec_mode;
277 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
278 nodes, CONTROL_TIMEOUT(),
282 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
283 talloc_free(tmp_ctx);
287 talloc_free(tmp_ctx);
/* Tell every active node that @pnn is the recovery master, by broadcasting
 * CTDB_CONTROL_SET_RECMASTER with the pnn as payload.
 * Returns 0 on success, -1 on failure.
 * NOTE(review): the "nodes," argument line, braces and returns were
 * dropped in extraction. */
292 change recovery master on all node
294 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
300 tmp_ctx = talloc_new(ctdb);
301 CTDB_NO_MEMORY(ctdb, tmp_ctx);
303 data.dsize = sizeof(uint32_t);
304 data.dptr = (unsigned char *)&pnn;
306 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
307 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
309 CONTROL_TIMEOUT(), false, data,
312 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
313 talloc_free(tmp_ctx);
317 talloc_free(tmp_ctx);
/* Make sure every other active node is attached to every database we have
 * locally: for each remote node, fetch its dbmap and create (attach) any
 * of our databases it is missing, preserving the persistent flag.
 * Returns 0 on success; a negative/early return on control failures.
 * NOTE(review): loop braces, "continue"s and returns were dropped in
 * extraction. */
323 ensure all other nodes have attached to any databases that we have
325 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
326 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
329 struct ctdb_dbid_map *remote_dbmap;
331 /* verify that all other nodes have all our databases */
332 for (j=0; j<nodemap->num; j++) {
333 /* we don't need to check ourselves */
334 if (nodemap->nodes[j].pnn == pnn) {
337 /* don't check nodes that are unavailable */
338 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
342 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
343 mem_ctx, &remote_dbmap);
345 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
349 /* step through all local databases */
350 for (db=0; db<dbmap->num;db++) {
354 for (i=0;i<remote_dbmap->num;i++) {
355 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
359 /* the remote node already have this database */
360 if (i!=remote_dbmap->num) {
363 /* ok so we need to create this database */
364 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
367 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
370 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
371 mem_ctx, name, dbmap->dbs[db].persistent);
373 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
/* Mirror of create_missing_remote_databases: make sure we are locally
 * attached to every database any other active node has. After creating a
 * missing local database, the local dbmap is re-read into *dbmap so the
 * caller sees the updated list. Returns 0 on success.
 * NOTE(review): loop braces, "continue"s and returns were dropped in
 * extraction. */
384 ensure we are attached to any databases that anyone else is attached to
386 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
387 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
390 struct ctdb_dbid_map *remote_dbmap;
392 /* verify that we have all database any other node has */
393 for (j=0; j<nodemap->num; j++) {
394 /* we don't need to check ourselves */
398 if (nodemap->nodes[j].pnn == pnn) {
399 /* don't check nodes that are unavailable */
403 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
404 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
406 mem_ctx, &remote_dbmap);
410 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
411 /* step through all databases on the remote node */
414 for (db=0; db<remote_dbmap->num;db++) {
415 for (i=0;i<(*dbmap)->num;i++) {
419 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
420 /* we already have this db locally */
423 if (i!=(*dbmap)->num) {
426 /* ok so we need to create this database and
429 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
430 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
433 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
434 nodemap->nodes[j].pnn));
436 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
439 remote_dbmap->dbs[db].persistent);
441 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* Pull one database from @srcnode into the temporary recovery tdb.
 * Fetches the whole db via CTDB_CONTROL_PULL_DB and walks the marshalled
 * record list. Each record is merged against any copy already in recdb:
 * the incoming record wins when its rsn is higher, or when the rsn is
 * equal but the stored record's dmaster differs from the recovery master
 * (rsn-based merge). Returns 0 on success, early-out on malformed data.
 * NOTE(review): the for-loop header's first lines, braces and returns were
 * dropped in extraction — the record-walk stride "rec->length + (uint8_t
 * *)rec" at the visible loop tail is the increment expression. */
452 pull the remote database contents from one node into the recdb
454 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
455 struct tdb_wrap *recdb, uint32_t dbid)
459 struct ctdb_marshall_buffer *reply;
460 struct ctdb_rec_data *rec;
462 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
464 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
465 CONTROL_TIMEOUT(), &outdata);
467 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
468 talloc_free(tmp_ctx);
472 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
474 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
475 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
476 talloc_free(tmp_ctx);
480 rec = (struct ctdb_rec_data *)&reply->data[0];
484 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
486 struct ctdb_ltdb_header *hdr;
489 key.dptr = &rec->data[0];
490 key.dsize = rec->keylen;
491 data.dptr = &rec->data[key.dsize];
492 data.dsize = rec->datalen;
494 hdr = (struct ctdb_ltdb_header *)data.dptr;
496 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
497 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
498 talloc_free(tmp_ctx);
502 /* fetch the existing record, if any */
503 existing = tdb_fetch(recdb->tdb, key);
505 if (existing.dptr != NULL) {
506 struct ctdb_ltdb_header header;
507 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
508 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
509 (unsigned)existing.dsize, srcnode));
511 talloc_free(tmp_ctx);
514 header = *(struct ctdb_ltdb_header *)existing.dptr;
516 if (!(header.rsn < hdr->rsn ||
517 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
522 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
523 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
524 talloc_free(tmp_ctx);
529 talloc_free(tmp_ctx);
/* Pull database @dbid from every active node into recdb (rsn-based merge
 * done by pull_one_remote_database). A node that fails the pull is charged
 * with nodemap->num culprit points. Returns 0 on success.
 * NOTE(review): "continue", braces and returns were dropped in
 * extraction. */
535 pull all the remote database contents into the recdb
537 static int pull_remote_database(struct ctdb_context *ctdb,
538 struct ctdb_recoverd *rec,
539 struct ctdb_node_map *nodemap,
540 struct tdb_wrap *recdb, uint32_t dbid)
544 /* pull all records from all other nodes across onto this node
545 (this merges based on rsn)
547 for (j=0; j<nodemap->num; j++) {
548 /* dont merge from nodes that are unavailable */
549 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
552 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
553 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
554 nodemap->nodes[j].pnn));
555 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
565 update flags on all active nodes
567 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
571 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
573 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Push our vnnmap to every active node via CTDB_CONTROL_SETVNNMAP.
 * Returns 0 on success.
 * NOTE(review): "continue", braces and returns were dropped in extraction;
 * the error message prints @pnn rather than the failing node's pnn — looks
 * like an upstream quirk, verify before changing. */
581 ensure all nodes have the same vnnmap we do
583 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
584 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
588 /* push the new vnn map out to all the nodes */
589 for (j=0; j<nodemap->num; j++) {
590 /* dont push to nodes that are unavailable */
591 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
595 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
597 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* Work item for an in-progress vacuum-fetch: a doubly linked list entry
 * (hung off ctdb_recoverd.vacuum_info) carrying the marshalled records
 * pulled from one source node and a cursor (r) into them.
 * NOTE(review): the "struct vacuum_info {" opening line, the srcnode
 * member and the closing "};" were dropped in extraction. */
607 struct vacuum_info *next, *prev;
608 struct ctdb_recoverd *rec;
610 struct ctdb_db_context *ctdb_db;
611 struct ctdb_marshall_buffer *recs;
612 struct ctdb_rec_data *r;
615 static void vacuum_fetch_next(struct vacuum_info *v);
618 called when a vacuum fetch has completed - just free it and do the next one
620 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
622 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
624 vacuum_fetch_next(v);
/* Process the next record in a vacuum_info work list: for each marshalled
 * record, try a non-blocking chainlock on the local ltdb, skip records that
 * are locked, missing, too short, or already dmastered locally, and for
 * the rest issue an async CTDB_NULL_FUNC call with CTDB_IMMEDIATE_MIGRATION
 * to migrate the record here; completion re-enters via
 * vacuum_fetch_callback. The function returns after dispatching one call.
 * NOTE(review): loop braces, "continue"s, the count decrement and the
 * final talloc_free(v) path were dropped in extraction. */
629 process the next element from the vacuum list
631 static void vacuum_fetch_next(struct vacuum_info *v)
633 struct ctdb_call call;
634 struct ctdb_rec_data *r;
636 while (v->recs->count) {
637 struct ctdb_client_call_state *state;
639 struct ctdb_ltdb_header *hdr;
642 call.call_id = CTDB_NULL_FUNC;
643 call.flags = CTDB_IMMEDIATE_MIGRATION;
646 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
649 call.key.dptr = &r->data[0];
650 call.key.dsize = r->keylen;
652 /* ensure we don't block this daemon - just skip a record if we can't get
654 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
658 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
659 if (data.dptr == NULL) {
660 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
664 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
666 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
670 hdr = (struct ctdb_ltdb_header *)data.dptr;
671 if (hdr->dmaster == v->rec->ctdb->pnn) {
672 /* its already local */
674 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
680 state = ctdb_call_send(v->ctdb_db, &call);
681 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
683 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
687 state->async.fn = vacuum_fetch_callback;
688 state->async.private_data = v;
697 destroy a vacuum info structure
699 static int vacuum_info_destructor(struct vacuum_info *v)
701 DLIST_REMOVE(v->rec->vacuum_info, v);
/* Message handler for CTDB_SRVID_VACUUM_FETCH: another node sends us a
 * marshalled buffer of records we should take over. Steps: bail on an
 * empty buffer, dedupe against in-flight vacuum work for the same
 * (srcnode, db_id), look up the db's persistent flag and name from the
 * local dbmap, attach, then queue a vacuum_info with a private copy of the
 * records and kick off vacuum_fetch_next().
 * NOTE(review): braces, returns, the srcnode extraction from the first
 * record and some cleanup lines were dropped in extraction. */
707 handler for vacuum fetch
709 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
710 TDB_DATA data, void *private_data)
712 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
713 struct ctdb_marshall_buffer *recs;
715 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
717 struct ctdb_dbid_map *dbmap=NULL;
718 bool persistent = false;
719 struct ctdb_db_context *ctdb_db;
720 struct ctdb_rec_data *r;
722 struct vacuum_info *v;
724 recs = (struct ctdb_marshall_buffer *)data.dptr;
725 r = (struct ctdb_rec_data *)&recs->data[0];
727 if (recs->count == 0) {
728 talloc_free(tmp_ctx);
734 for (v=rec->vacuum_info;v;v=v->next) {
735 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
736 /* we're already working on records from this node */
737 talloc_free(tmp_ctx);
742 /* work out if the database is persistent */
743 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
745 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
746 talloc_free(tmp_ctx);
750 for (i=0;i<dbmap->num;i++) {
751 if (dbmap->dbs[i].dbid == recs->db_id) {
752 persistent = dbmap->dbs[i].persistent;
756 if (i == dbmap->num) {
757 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
758 talloc_free(tmp_ctx);
762 /* find the name of this database */
763 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
764 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
765 talloc_free(tmp_ctx);
770 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
771 if (ctdb_db == NULL) {
772 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
773 talloc_free(tmp_ctx);
777 v = talloc_zero(rec, struct vacuum_info);
779 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
780 talloc_free(tmp_ctx);
785 v->srcnode = srcnode;
786 v->ctdb_db = ctdb_db;
787 v->recs = talloc_memdup(v, recs, data.dsize);
788 if (v->recs == NULL) {
789 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
791 talloc_free(tmp_ctx);
794 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
796 DLIST_ADD(rec->vacuum_info, v);
798 talloc_set_destructor(v, vacuum_info_destructor);
800 vacuum_fetch_next(v);
801 talloc_free(tmp_ctx);
806 called when ctdb_wait_timeout should finish
808 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
809 struct timeval yt, void *p)
811 uint32_t *timed_out = (uint32_t *)p;
816 wait for a given number of seconds
818 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
820 uint32_t timed_out = 0;
821 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
823 event_loop_once(ctdb->ev);
828 called when an election times out (ends)
830 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
831 struct timeval t, void *p)
833 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
834 rec->election_timeout = NULL;
836 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
841 wait for an election to finish. It finished election_timeout seconds after
842 the last election packet is received
844 static void ctdb_wait_election(struct ctdb_recoverd *rec)
846 struct ctdb_context *ctdb = rec->ctdb;
847 while (rec->election_timeout) {
848 event_loop_once(ctdb->ev);
/* Compare our cached node flags against every connected remote node's view
 * of itself. On mismatch, push the remote node's own flags cluster-wide
 * via CTDB_CONTROL_MODIFY_FLAGS and update our local copy. Returns
 * MONITOR_FAILED if a remote nodemap cannot be fetched (culpriting that
 * node); otherwise an OK result (dropped in extraction).
 * NOTE(review): "continue"s, braces and the final return were dropped. */
853 Update our local flags from all remote connected nodes.
854 This is only run when we are or we believe we are the recovery master
856 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
859 struct ctdb_context *ctdb = rec->ctdb;
860 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
862 /* get the nodemap for all active remote nodes and verify
863 they are the same as for this node
865 for (j=0; j<nodemap->num; j++) {
866 struct ctdb_node_map *remote_nodemap=NULL;
869 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
872 if (nodemap->nodes[j].pnn == ctdb->pnn) {
876 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
877 mem_ctx, &remote_nodemap);
879 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
880 nodemap->nodes[j].pnn));
881 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
882 talloc_free(mem_ctx);
883 return MONITOR_FAILED;
885 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
886 /* We should tell our daemon about this so it
887 updates its flags or else we will log the same
888 message again in the next iteration of recovery.
889 Since we are the recovery master we can just as
890 well update the flags on all nodes.
892 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
894 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
898 /* Update our local copy of the flags in the recovery
901 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
902 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
903 nodemap->nodes[j].flags));
904 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
906 talloc_free(remote_nodemap);
908 talloc_free(mem_ctx);
913 /* Create a new random generation ip.
914 The generation id can not be the INVALID_GENERATION id
916 static uint32_t new_generation(void)
921 generation = random();
923 if (generation != INVALID_GENERATION) {
/* Create the temporary working database (recdb.tdb) used to merge all
 * remote copies of a database during recovery. Opened O_CREAT|O_EXCL with
 * TDB_NOLOCK (single-threaded use), plus TDB_NOMMAP when do_setsched is
 * off. Returns the tdb_wrap, or NULL on failure.
 * NOTE(review): the unlink of a stale recdb, braces and returns were
 * dropped in extraction. */
933 create a temporary working database
935 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
938 struct tdb_wrap *recdb;
941 /* open up the temporary recovery database */
942 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
948 tdb_flags = TDB_NOLOCK;
949 if (!ctdb->do_setsched) {
950 tdb_flags |= TDB_NOMMAP;
953 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
954 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
956 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
/* Accumulator passed through traverse_recdb(): the marshall buffer being
 * built, its running length, and a failure flag.
 * NOTE(review): the "struct recdb_data {" opening line, the len/failed
 * members and the closing "};" were dropped in extraction. */
966 a traverse function for pulling all relevent records from recdb
969 struct ctdb_context *ctdb;
970 struct ctdb_marshall_buffer *recdata;
/* tdb_traverse_read callback: for each non-empty recdb record, rewrite the
 * ltdb header's dmaster to the local pnn and append the marshalled record
 * to params->recdata, growing the buffer as needed. Sets params->failed on
 * allocation failure. Records whose payload is only a bare ltdb header are
 * skipped (deleted/empty records).
 * NOTE(review): braces and return statements were dropped in extraction;
 * the talloc_realloc_size with a NULL context reparents recdata — matches
 * the surrounding talloc usage, verify against upstream before changing. */
975 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
977 struct recdb_data *params = (struct recdb_data *)p;
978 struct ctdb_rec_data *rec;
979 struct ctdb_ltdb_header *hdr;
981 /* skip empty records */
982 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
986 /* update the dmaster field to point to us */
987 hdr = (struct ctdb_ltdb_header *)data.dptr;
988 hdr->dmaster = params->ctdb->pnn;
990 /* add the record to the blob ready to send to the nodes */
991 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
993 params->failed = true;
996 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
997 if (params->recdata == NULL) {
998 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
999 rec->length + params->len, params->recdata->count));
1000 params->failed = true;
1003 params->recdata->count++;
1004 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1005 params->len += rec->length;
1012 push the recdb database out to all nodes
1014 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1015 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1017 struct recdb_data params;
1018 struct ctdb_marshall_buffer *recdata;
1020 TALLOC_CTX *tmp_ctx;
1023 tmp_ctx = talloc_new(ctdb);
1024 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1026 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1027 CTDB_NO_MEMORY(ctdb, recdata);
1029 recdata->db_id = dbid;
1032 params.recdata = recdata;
1033 params.len = offsetof(struct ctdb_marshall_buffer, data);
1034 params.failed = false;
1036 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1037 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1038 talloc_free(params.recdata);
1039 talloc_free(tmp_ctx);
1043 if (params.failed) {
1044 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1045 talloc_free(params.recdata);
1046 talloc_free(tmp_ctx);
1050 recdata = params.recdata;
1052 outdata.dptr = (void *)recdata;
1053 outdata.dsize = params.len;
1055 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1056 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1058 CONTROL_TIMEOUT(), false, outdata,
1061 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1062 talloc_free(recdata);
1063 talloc_free(tmp_ctx);
1067 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1068 dbid, recdata->count));
1070 talloc_free(recdata);
1071 talloc_free(tmp_ctx);
/* Full recovery of one database: create the temporary recdb, pull every
 * active node's copy into it (rsn merge), wipe the database cluster-wide
 * inside the already-open transaction (CTDB_CONTROL_WIPE_DATABASE with the
 * transaction id), then push the merged contents back out with the local
 * node as dmaster. Returns 0 on success.
 * NOTE(review): error returns, the w.db_id assignment and the closing
 * cleanup were dropped in extraction. */
1078 go through a full recovery on one database
1080 static int recover_database(struct ctdb_recoverd *rec,
1081 TALLOC_CTX *mem_ctx,
1084 struct ctdb_node_map *nodemap,
1085 uint32_t transaction_id)
1087 struct tdb_wrap *recdb;
1089 struct ctdb_context *ctdb = rec->ctdb;
1091 struct ctdb_control_wipe_database w;
1094 recdb = create_recdb(ctdb, mem_ctx);
1095 if (recdb == NULL) {
1099 /* pull all remote databases onto the recdb */
1100 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
1102 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1106 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1108 /* wipe all the remote databases. This is safe as we are in a transaction */
1110 w.transaction_id = transaction_id;
1112 data.dptr = (void *)&w;
1113 data.dsize = sizeof(w);
1115 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1116 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1118 CONTROL_TIMEOUT(), false, data,
1121 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1126 /* push out the correct database. This sets the dmaster and skips
1127 the empty records */
1128 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1134 /* all done with this database */
1141 reload the nodes file
1143 static void reload_nodes_file(struct ctdb_context *ctdb)
1146 ctdb_load_nodes_file(ctdb);
1151 we are the recmaster, and recovery is needed - start a recovery run
1153 static int do_recovery(struct ctdb_recoverd *rec,
1154 TALLOC_CTX *mem_ctx, uint32_t pnn,
1155 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1157 struct ctdb_context *ctdb = rec->ctdb;
1159 uint32_t generation;
1160 struct ctdb_dbid_map *dbmap;
1163 struct timeval start_time;
1165 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1167 /* if recovery fails, force it again */
1168 rec->need_recovery = true;
1170 for (i=0; i<ctdb->num_nodes; i++) {
1171 struct ctdb_banning_state *ban_state;
1173 if (ctdb->nodes[i]->ban_state == NULL) {
1176 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1177 if (ban_state->count < 2*ctdb->num_nodes) {
1180 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1181 ctdb->nodes[i]->pnn, ban_state->count,
1182 ctdb->tunable.recovery_ban_period));
1183 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1184 ban_state->count = 0;
1188 if (ctdb->tunable.verify_recovery_lock != 0) {
1189 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1190 start_time = timeval_current();
1191 if (!ctdb_recovery_lock(ctdb, true)) {
1192 ctdb_set_culprit(rec, pnn);
1193 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1196 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1197 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1200 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1202 /* get a list of all databases */
1203 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1205 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1209 /* we do the db creation before we set the recovery mode, so the freeze happens
1210 on all databases we will be dealing with. */
1212 /* verify that we have all the databases any other node has */
1213 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1215 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1219 /* verify that all other nodes have all our databases */
1220 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1222 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1226 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1229 /* set recovery mode to active on all nodes */
1230 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1232 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1236 /* execute the "startrecovery" event script on all nodes */
1237 ret = run_startrecovery_eventscript(rec, nodemap);
1239 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1243 /* pick a new generation number */
1244 generation = new_generation();
1246 /* change the vnnmap on this node to use the new generation
1247 number but not on any other nodes.
1248 this guarantees that if we abort the recovery prematurely
1249 for some reason (a node stops responding?)
1250 that we can just return immediately and we will reenter
1251 recovery shortly again.
1252 I.e. we deliberately leave the cluster with an inconsistent
1253 generation id to allow us to abort recovery at any stage and
1254 just restart it from scratch.
1256 vnnmap->generation = generation;
1257 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1259 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1263 data.dptr = (void *)&generation;
1264 data.dsize = sizeof(uint32_t);
1266 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1267 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1269 CONTROL_TIMEOUT(), false, data,
1272 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1276 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1278 for (i=0;i<dbmap->num;i++) {
1279 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1280 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1285 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1287 /* commit all the changes */
1288 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1290 CONTROL_TIMEOUT(), false, data,
1293 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1297 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1300 /* update the capabilities for all nodes */
1301 ret = update_capabilities(ctdb, nodemap);
1303 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1307 /* build a new vnn map with all the currently active and
1309 generation = new_generation();
1310 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1311 CTDB_NO_MEMORY(ctdb, vnnmap);
1312 vnnmap->generation = generation;
1314 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1315 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1316 for (i=j=0;i<nodemap->num;i++) {
1317 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1320 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1321 /* this node can not be an lmaster */
1322 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1327 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1328 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1329 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1332 if (vnnmap->size == 0) {
1333 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1335 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1336 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1337 vnnmap->map[0] = pnn;
1340 /* update to the new vnnmap on all nodes */
1341 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1343 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1347 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1349 /* update recmaster to point to us for all nodes */
1350 ret = set_recovery_master(ctdb, nodemap, pnn);
1352 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1356 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1359 update all nodes to have the same flags that we have
1361 for (i=0;i<nodemap->num;i++) {
1362 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1366 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1368 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1373 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1375 /* disable recovery mode */
1376 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
1378 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1382 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1385 tell nodes to takeover their public IPs
1387 rec->need_takeover_run = false;
1388 ret = ctdb_takeover_run(ctdb, nodemap);
1390 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1393 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1395 /* execute the "recovered" event script on all nodes */
1396 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1398 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1402 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1404 /* send a message to all clients telling them that the cluster
1405 has been reconfigured */
1406 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1408 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1410 rec->need_recovery = false;
1412 /* We just finished a recovery successfully.
1413 We now wait for rerecovery_timeout before we allow
1414 another recovery to take place.
1416 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1417 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1418 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
1425 elections are won by first checking the number of connected nodes, then
1426 the priority time, then the pnn
/* Election payload broadcast between recovery daemons.  Fields are compared
 * in priority order: num_connected first, then priority_time, then pnn
 * (see ctdb_election_win below).
 * NOTE(review): elsewhere the code reads em->pnn, so the original struct has
 * a pnn member as well -- this listing appears to omit that line. */
1428 struct election_message {
1429 	uint32_t num_connected;
1430 	struct timeval priority_time;
1432 	uint32_t node_flags;
1436 form this nodes election data
/* Fill *em with this node's election credentials: its pnn, the time the
 * recovery daemon started (priority_time), its node flags and the number
 * of nodes it can currently see.
 * NOTE(review): the listing elides some lines (declarations, error-check
 * braces, returns) between the visible statements. */
1438 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1441 	struct ctdb_node_map *nodemap;
1442 	struct ctdb_context *ctdb = rec->ctdb;
1446 	em->pnn = rec->ctdb->pnn;
1447 	em->priority_time = rec->priority_time;
/* Pull the current node map from the local daemon; on failure we cannot
 * build meaningful election data. */
1449 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1451 		DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
/* Cache our own flags on rec so ctdb_election_win can consult them later. */
1455 	rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1456 	em->node_flags = rec->node_flags;
/* Count every node that is not disconnected from us. */
1458 	for (i=0;i<nodemap->num;i++) {
1459 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1460 			em->num_connected++;
1464 	/* we shouldnt try to win this election if we cant be a recmaster */
/* Without the RECMASTER capability, zero out our credentials so any
 * capable node beats us. */
1465 	if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1466 		em->num_connected = 0;
1467 		em->priority_time = timeval_current();
1470 	talloc_free(nodemap);
1474 see if the given election data wins
/* Compare the remote node's election message *em against our own data and
 * decide whether WE should win the election.  Disqualifiers (no recmaster
 * capability, banned, stopped) are checked first, then the tie-breakers:
 * most connected nodes, then longest-running, then lowest pnn.
 * NOTE(review): the return statements inside each branch are elided from
 * this listing. */
1476 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1478 	struct election_message myem;
1481 	ctdb_election_data(rec, &myem);
1483 	/* we cant win if we dont have the recmaster capability */
1484 	if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1488 	/* we cant win if we are banned */
1489 	if (rec->node_flags & NODE_FLAGS_BANNED) {
1493 	/* we cant win if we are stopped */
1494 	if (rec->node_flags & NODE_FLAGS_STOPPED) {
1498 	/* we will automatically win if the other node is banned */
1499 	if (em->node_flags & NODE_FLAGS_BANNED) {
/* NOTE(review): comment below says "banned" but the check is for STOPPED --
 * looks like a copy/paste slip in the original comment. */
1503 	/* we will automatically win if the other node is banned */
1504 	if (em->node_flags & NODE_FLAGS_STOPPED) {
1508 	/* try to use the most connected node */
1510 	cmp = (int)myem.num_connected - (int)em->num_connected;
1513 	/* then the longest running node */
1515 	cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* Final tie-breaker: prefer the lower pnn. */
1519 	cmp = (int)myem.pnn - (int)em->pnn;
1526 send out an election request
/* Broadcast an election message carrying this node's credentials to all
 * nodes.  If update_recmaster is true we also optimistically set ourselves
 * as recmaster on the local node, assuming we will win.
 * Returns 0 on success; the error-return lines are elided in this listing. */
1528 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1531 	TDB_DATA election_data;
1532 	struct election_message emsg;
1534 	struct ctdb_context *ctdb = rec->ctdb;
1536 	srvid = CTDB_SRVID_RECOVERY;
1538 	ctdb_election_data(rec, &emsg);
1540 	election_data.dsize = sizeof(struct election_message);
1541 	election_data.dptr  = (unsigned char *)&emsg;
1544 	/* send an election message to all active nodes */
1545 	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1546 	ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1549 	/* A new node that is already frozen has entered the cluster.
1550 	   The existing nodes are not frozen and dont need to be frozen
1551 	   until the election has ended and we start the actual recovery
1553 	if (update_recmaster == true) {
1554 		/* first we assume we will win the election and set
1555 		   recoverymaster to be ourself on the current node
1557 		ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1559 			DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1569 this function will unban all nodes in the cluster
/* Clear the BANNED flag on every connected node.  Best-effort: getnodemap
 * failure is logged and the function bails (error branch elided here). */
1571 static void unban_all_nodes(struct ctdb_context *ctdb)
1574 	struct ctdb_node_map *nodemap;
1575 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1577 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1579 		DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
/* Only touch nodes that are reachable AND currently banned. */
1583 	for (i=0;i<nodemap->num;i++) {
1584 		if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1585 		&& (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1586 			ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1590 	talloc_free(tmp_ctx);
1595 we think we are winning the election - send a broadcast election request
/* Timed-event callback: re-broadcast our election request (without touching
 * the local recmaster setting), then drop the one-shot timer so it is not
 * fired again until re-armed by election_handler. */
1597 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1599 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1602 	ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1604 		DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1607 	talloc_free(rec->send_election_te);
1608 	rec->send_election_te = NULL;
1612 handler for memory dumps
/* Message handler for CTDB_SRVID_MEM_DUMP: collect a talloc memory report
 * of this recovery daemon and send it back to the requester identified by
 * the rd_memdump_reply (pnn + srvid) embedded in the message payload. */
1614 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1615 			     TDB_DATA data, void *private_data)
1617 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1620 	struct rd_memdump_reply *rd;
/* Validate the payload size before trusting the return address inside it. */
1622 	if (data.dsize != sizeof(struct rd_memdump_reply)) {
1623 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1624 		talloc_free(tmp_ctx);
1627 	rd = (struct rd_memdump_reply *)data.dptr;
1629 	dump = talloc_zero(tmp_ctx, TDB_DATA);
1631 		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1632 		talloc_free(tmp_ctx);
1635 	ret = ctdb_dump_memory(ctdb, dump);
1637 		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1638 		talloc_free(tmp_ctx);
1642 	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* Reply directly to the caller's private srvid with the dump blob. */
1644 	ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1646 		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1647 		talloc_free(tmp_ctx);
1651 	talloc_free(tmp_ctx);
1655 handler for reload_nodes
/* Message handler for CTDB_SRVID_RELOAD_NODES: re-read the nodes file so
 * that node additions/removals take effect in this recovery daemon. */
1657 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1658 			     TDB_DATA data, void *private_data)
1660 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1662 	DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1664 	reload_nodes_file(rec->ctdb);
1668 handler for ip reallocate, just add it to the list of callers and
1669 handle this later in the monitor_cluster loop so we do not recurse
1670 with other callers to takeover_run()
/* Message handler for CTDB_SRVID_TAKEOVER_RUN: queue the caller's reply
 * address on rec->reallocate_callers; the actual takeover run and the
 * replies happen later in process_ipreallocate_requests(). */
1672 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1673 				  TDB_DATA data, void *private_data)
1675 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1676 	struct ip_reallocate_list *caller;
/* The payload reuses struct rd_memdump_reply as a generic (pnn, srvid)
 * return address. */
1678 	if (data.dsize != sizeof(struct rd_memdump_reply)) {
1679 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
/* Lazily create the context that owns all queued callers; freed wholesale
 * after the requests are processed. */
1683 	if (rec->ip_reallocate_ctx == NULL) {
1684 		rec->ip_reallocate_ctx = talloc_new(rec);
/* NOTE(review): this checks `caller`, which is still uninitialized here --
 * it almost certainly should check rec->ip_reallocate_ctx. Verify upstream. */
1685 		CTDB_NO_MEMORY_FATAL(ctdb, caller);
1688 	caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1689 	CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* Take ownership of the message payload so the reply address survives
 * until process_ipreallocate_requests() runs; push onto the list head. */
1691 	caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1692 	caller->next = rec->reallocate_callers;
1693 	rec->reallocate_callers = caller;
/* Perform one takeover run on behalf of all queued "ctdb ipreallocate"
 * callers, send each of them the int32 result code, then drop the queue. */
1698 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1700 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1703 	struct ip_reallocate_list *callers;
1705 	DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1706 	ret = ctdb_takeover_run(ctdb, rec->nodemap);
/* The reply payload is just the takeover-run return code. */
1707 	result.dsize = sizeof(int32_t);
1708 	result.dptr  = (uint8_t *)&ret;
1710 	for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1711 		DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1712 		ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1714 			DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1718 	talloc_free(tmp_ctx);
/* Freeing ip_reallocate_ctx releases every queued caller and its stolen
 * reply address in one go. */
1719 	talloc_free(rec->ip_reallocate_ctx);
1720 	rec->ip_reallocate_ctx = NULL;
1721 	rec->reallocate_callers = NULL;
1727 handler for recovery master elections
/* Message handler for CTDB_SRVID_RECOVERY: another node has broadcast its
 * election credentials.  Re-arm the election timeout, and either contest
 * the election (if we would win) or concede and record the sender as
 * recmaster on the local daemon. */
1729 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1730 			     TDB_DATA data, void *private_data)
1732 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1734 	struct election_message *em = (struct election_message *)data.dptr;
1735 	TALLOC_CTX *mem_ctx;
1737 	/* we got an election packet - update the timeout for the election */
1738 	talloc_free(rec->election_timeout);
1739 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1740 						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1741 						ctdb_election_timeout, rec);
1743 	mem_ctx = talloc_new(ctdb);
1745 	/* someone called an election. check their election data
1746 	   and if we disagree and we would rather be the elected node,
1747 	   send a new election message to all other nodes
1749 	if (ctdb_election_win(rec, em)) {
/* Defer our counter-broadcast by 500ms so responses can aggregate rather
 * than triggering an election storm. */
1750 		if (!rec->send_election_te) {
1751 			rec->send_election_te = event_add_timed(ctdb->ev, rec,
1752 								timeval_current_ofs(0, 500000),
1753 								election_send_request, rec);
1755 		talloc_free(mem_ctx);
1756 		/*unban_all_nodes(ctdb);*/
/* We lost: cancel any pending counter-broadcast. */
1761 	talloc_free(rec->send_election_te);
1762 	rec->send_election_te = NULL;
1764 	if (ctdb->tunable.verify_recovery_lock != 0) {
1765 		/* release the recmaster lock */
/* If another node won, drop our hold on the recovery lock file so the new
 * recmaster can take it, and unban everyone for a clean slate. */
1766 		if (em->pnn != ctdb->pnn &&
1767 		    ctdb->recovery_lock_fd != -1) {
1768 			close(ctdb->recovery_lock_fd);
1769 			ctdb->recovery_lock_fd = -1;
1770 			unban_all_nodes(ctdb);
1774 	/* ok, let that guy become recmaster then */
1775 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1777 		DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1778 		talloc_free(mem_ctx);
1782 	talloc_free(mem_ctx);
1788 force the start of the election process
/* Kick off a recmaster election: freeze the cluster by switching all nodes
 * to recovery mode, arm the election timeout, broadcast our credentials
 * (claiming recmaster locally), then block until the election window ends. */
1790 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1791 			   struct ctdb_node_map *nodemap)
1794 	struct ctdb_context *ctdb = rec->ctdb;
1796 	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1798 	/* set all nodes to recovery mode to stop all internode traffic */
1799 	ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1801 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1805 	talloc_free(rec->election_timeout);
1806 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1807 						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1808 						ctdb_election_timeout, rec);
1810 	ret = send_election_request(rec, pnn, true);
1812 		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1816 	/* wait for a few seconds to collect all responses */
1817 	ctdb_wait_election(rec);
1823 handler for when a node changes its flags
/* Message handler for CTDB_SRVID_SET_NODE_FLAGS: a node's flags changed.
 * Refresh our view of the flags and, when we are the recmaster in normal
 * mode and a DISABLED-type flag flipped, schedule a takeover run so public
 * IPs follow the new health state.
 * NOTE(review): several error-check branches and closing braces between
 * the visible statements are elided in this listing. */
1825 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1826 			     TDB_DATA data, void *private_data)
1829 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1830 	struct ctdb_node_map *nodemap=NULL;
1831 	TALLOC_CTX *tmp_ctx;
1832 	uint32_t changed_flags;
1834 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1836 	if (data.dsize != sizeof(*c)) {
1837 		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1841 	tmp_ctx = talloc_new(ctdb);
1842 	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1844 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1846 		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1847 		talloc_free(tmp_ctx);
/* Locate the node this change refers to in our node map. */
1852 	for (i=0;i<nodemap->num;i++) {
1853 		if (nodemap->nodes[i].pnn == c->pnn) break;
1856 	if (i == nodemap->num) {
1857 		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1858 		talloc_free(tmp_ctx);
1862 	changed_flags = c->old_flags ^ c->new_flags;
1864 	if (nodemap->nodes[i].flags != c->new_flags) {
1865 		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1868 	nodemap->nodes[i].flags = c->new_flags;
/* Refresh recmaster/recmode from the local daemon before deciding whether
 * a takeover run is ours to schedule. */
1870 	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1871 		     CTDB_CURRENT_NODE, &ctdb->recovery_master);
1874 	ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1875 		     CTDB_CURRENT_NODE, &ctdb->recovery_mode);
1879 	    ctdb->recovery_master == ctdb->pnn &&
1880 	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
1881 		/* Only do the takeover run if the perm disabled or unhealthy
1882 		   flags changed since these will cause an ip failover but not
1884 		   If the node became disconnected or banned this will also
1885 		   lead to an ip address failover but that is handled
1888 		if (changed_flags & NODE_FLAGS_DISABLED) {
1889 			rec->need_takeover_run = true;
1893 	talloc_free(tmp_ctx);
1897 handler for when we need to push out flag changes ot all other nodes
/* Message handler for CTDB_SRVID_PUSH_NODE_FLAGS: propagate one node's new
 * flags to every node (set new_flags, clear everything else). */
1899 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
1900 			     TDB_DATA data, void *private_data)
1903 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1905 	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
1907 		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Shared state for the async recmode verification: a count of outstanding
 * replies (line elided in this listing) and the aggregated result. */
1912 struct verify_recmode_normal_data {
1914 	enum monitor_result status;
/* Async completion callback for one GETRECMODE control.  Downgrades the
 * aggregate status to MONITOR_FAILED on control failure, or to
 * MONITOR_RECOVERY_NEEDED if the node reports it is in recovery mode.
 * The count decrement and early returns are elided in this listing. */
1917 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1919 	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1922 	/* one more node has responded with recmode data*/
1925 	/* if we failed to get the recmode, then return an error and let
1926 	   the main loop try again.
1928 	if (state->state != CTDB_CONTROL_DONE) {
1929 		if (rmdata->status == MONITOR_OK) {
1930 			rmdata->status = MONITOR_FAILED;
1935 	/* if we got a response, then the recmode will be stored in the
1938 	if (state->status != CTDB_RECOVERY_NORMAL) {
1939 		DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
1940 		rmdata->status = MONITOR_RECOVERY_NEEDED;
1947 /* verify that all nodes are in normal recovery mode */
/* Send async GETRECMODE controls to every active node, pump the event loop
 * until all replies are in (verify_recmode_normal_callback aggregates
 * them), and return the combined monitor_result. */
1948 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
1950 	struct verify_recmode_normal_data *rmdata;
1951 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1952 	struct ctdb_client_control_state *state;
1953 	enum monitor_result status;
1956 	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1957 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1959 	rmdata->status = MONITOR_OK;
1961 	/* loop over all active nodes and send an async getrecmode call to
/* Inactive (banned/stopped/disconnected/deleted) nodes are skipped. */
1963 	for (j=0; j<nodemap->num; j++) {
1964 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1967 		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1969 					nodemap->nodes[j].pnn);
1970 		if (state == NULL) {
1971 			/* we failed to send the control, treat this as
1972 			   an error and try again next iteration
1974 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1975 			talloc_free(mem_ctx);
1976 			return MONITOR_FAILED;
1979 		/* set up the callback functions */
1980 		state->async.fn = verify_recmode_normal_callback;
1981 		state->async.private_data = rmdata;
1983 		/* one more control to wait for to complete */
1988 	/* now wait for up to the maximum number of seconds allowed
1989 	   or until all nodes we expect a response from has replied
/* rmdata->count is decremented by the callback (decrement line elided). */
1991 	while (rmdata->count > 0) {
1992 		event_loop_once(ctdb->ev);
1995 	status = rmdata->status;
1996 	talloc_free(mem_ctx);
/* Shared state for the async recmaster verification: the recovery daemon,
 * the expected recmaster pnn and reply count (those lines are elided in
 * this listing), and the aggregated result. */
2001 struct verify_recmaster_data {
2002 	struct ctdb_recoverd *rec;
2005 	enum monitor_result status;
/* Async completion callback for one GETRECMASTER control.  Marks the
 * aggregate status MONITOR_FAILED on control failure; if a node names a
 * different recmaster than us, records it as culprit and demands a new
 * election (MONITOR_ELECTION_NEEDED). */
2008 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2010 	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2013 	/* one more node has responded with recmaster data*/
2016 	/* if we failed to get the recmaster, then return an error and let
2017 	   the main loop try again.
2019 	if (state->state != CTDB_CONTROL_DONE) {
2020 		if (rmdata->status == MONITOR_OK) {
2021 			rmdata->status = MONITOR_FAILED;
2026 	/* if we got a response, then the recmaster will be stored in the
2029 	if (state->status != rmdata->pnn) {
2030 		DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2031 		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2032 		rmdata->status = MONITOR_ELECTION_NEEDED;
2039 /* verify that all nodes agree that we are the recmaster */
/* Mirror of verify_recmode for the recmaster: async GETRECMASTER to every
 * active node, wait for all replies, and return the aggregated result
 * (MONITOR_ELECTION_NEEDED if any node disagrees that pnn is recmaster). */
2040 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2042 	struct ctdb_context *ctdb = rec->ctdb;
2043 	struct verify_recmaster_data *rmdata;
2044 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2045 	struct ctdb_client_control_state *state;
2046 	enum monitor_result status;
2049 	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2050 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2054 	rmdata->status = MONITOR_OK;
2056 	/* loop over all active nodes and send an async getrecmaster call to
2058 	for (j=0; j<nodemap->num; j++) {
2059 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2062 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2064 					nodemap->nodes[j].pnn);
2065 		if (state == NULL) {
2066 			/* we failed to send the control, treat this as
2067 			   an error and try again next iteration
2069 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2070 			talloc_free(mem_ctx);
2071 			return MONITOR_FAILED;
2074 		/* set up the callback functions */
2075 		state->async.fn = verify_recmaster_callback;
2076 		state->async.private_data = rmdata;
2078 		/* one more control to wait for to complete */
2083 	/* now wait for up to the maximum number of seconds allowed
2084 	   or until all nodes we expect a response from has replied
/* rmdata->count is decremented by the callback (decrement line elided). */
2086 	while (rmdata->count > 0) {
2087 		event_loop_once(ctdb->ev);
2090 	status = rmdata->status;
2091 	talloc_free(mem_ctx);
2096 /* called to check that the allocation of public ip addresses is ok.
/* Cross-check the daemon's public-IP assignment against the interfaces
 * actually configured on this host.  Uptime is sampled before and after
 * reading the IP list so the check is skipped when a recovery started,
 * finished, or is in progress concurrently (the assignment would be in
 * flux).  On a mismatch, freeze the local node and force recovery mode
 * ACTIVE so the recmaster performs a full recovery.
 * NOTE(review): error-check branches and returns between the visible
 * statements are elided in this listing. */
2098 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2100 	TALLOC_CTX *mem_ctx = talloc_new(NULL);
2101 	struct ctdb_all_public_ips *ips = NULL;
2102 	struct ctdb_uptime *uptime1 = NULL;
2103 	struct ctdb_uptime *uptime2 = NULL;
2106 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2107 				CTDB_CURRENT_NODE, &uptime1);
2109 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2110 		talloc_free(mem_ctx);
2114 	/* read the ip allocation from the local node */
2115 	ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2117 		DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2118 		talloc_free(mem_ctx);
2122 	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2123 				CTDB_CURRENT_NODE, &uptime2);
2125 		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2126 		talloc_free(mem_ctx);
2130 	/* skip the check if the startrecovery time has changed */
2131 	if (timeval_compare(&uptime1->last_recovery_started,
2132 			    &uptime2->last_recovery_started) != 0) {
2133 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2134 		talloc_free(mem_ctx);
2138 	/* skip the check if the endrecovery time has changed */
2139 	if (timeval_compare(&uptime1->last_recovery_finished,
2140 			    &uptime2->last_recovery_finished) != 0) {
2141 		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2142 		talloc_free(mem_ctx);
2146 	/* skip the check if we have started but not finished recovery */
2147 	if (timeval_compare(&uptime1->last_recovery_finished,
2148 			    &uptime1->last_recovery_started) != 1) {
2149 		DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
2150 		talloc_free(mem_ctx);
2155 	/* verify that we have the ip addresses we should have
2156 	   and we dont have ones we shouldnt have.
2157 	   if we find an inconsistency we set recmode to
2158 	   active on the local node and wait for the recmaster
2159 	   to do a full blown recovery
2161 	for (j=0; j<ips->num; j++) {
/* Case 1: an IP the daemon thinks we hold is missing from the host. */
2162 		if (ips->ips[j].pnn == pnn) {
2163 			if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2164 				DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2165 					ctdb_addr_to_str(&ips->ips[j].addr)));
2166 				ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2168 					DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2170 					talloc_free(mem_ctx);
2173 				ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2175 					DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2177 					talloc_free(mem_ctx);
/* Case 2: the host still holds an IP assigned to another node. */
2182 			if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2183 				DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2184 					ctdb_addr_to_str(&ips->ips[j].addr)));
2186 				ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2188 					DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2190 					talloc_free(mem_ctx);
2193 				ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2195 					DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2197 					talloc_free(mem_ctx);
2204 	talloc_free(mem_ctx);
/* Async callback for GET_NODEMAP: stash the returned node map (ownership
 * stolen from outdata) into remote_nodemaps[node_pnn], after bounds-
 * checking the reporting pnn against the known node count. */
2209 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2211 	struct ctdb_node_map **remote_nodemaps = callback_data;
2213 	if (node_pnn >= ctdb->num_nodes) {
2214 		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2218 	remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Fetch the node map from every active node in parallel; results land in
 * remote_nodemaps[] via async_getnodemap_callback.  Returns non-zero if
 * any node failed to reply (success-return line elided in this listing). */
2222 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2223 	struct ctdb_node_map *nodemap,
2224 	struct ctdb_node_map **remote_nodemaps)
2228 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2229 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2231 					CONTROL_TIMEOUT(), false, tdb_null,
2232 					async_getnodemap_callback,
2234 					remote_nodemaps) != 0) {
2235 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Result states for the reclock-checking child process, and the state
 * shared between the parent event handlers and the child's pipe.
 * (The pipe fd pair and child pid members are elided in this listing.) */
2243 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
2244 struct ctdb_check_reclock_state {
2245 	struct ctdb_context *ctdb;
2246 	struct timeval start_time;
2249 	struct timed_event *te;
2250 	struct fd_event *fde;
2251 	enum reclock_child_status status;
2254 /* when we free the reclock state we must kill any child process.
/* talloc destructor: report how long the reclock check took, close both
 * pipe ends, and SIGKILL the checker child so it never outlives the state. */
2256 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2258 	struct ctdb_context *ctdb = state->ctdb;
2260 	ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2262 	if (state->fd[0] != -1) {
2263 		close(state->fd[0]);
2266 	if (state->fd[1] != -1) {
2267 		close(state->fd[1]);
2270 	kill(state->child, SIGKILL);
2275 	called if our check_reclock child times out. this would happen if
2276 	i/o to the reclock file blocks.
/* Timed-event callback: the child did not answer within the deadline --
 * mark the check RECLOCK_TIMEOUT so check_recovery_lock's wait loop exits. */
2278 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2279 					struct timeval t, void *private_data)
2281 	struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2282 					   struct ctdb_check_reclock_state);
2284 	DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2285 	state->status = RECLOCK_TIMEOUT;
2288 /* this is called when the child process has completed checking the reclock
2289    file and has written data back to us through the pipe.
/* fd-event callback: read the one-byte verdict from the child's pipe,
 * cancel the timeout event, and set status to RECLOCK_OK or RECLOCK_FAILED
 * accordingly. */
2291 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2292 			     uint16_t flags, void *private_data)
2294 	struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2295 					     struct ctdb_check_reclock_state);
2299 	/* we got a response from our child process so we can abort the
2302 	talloc_free(state->te);
/* A short read or any byte other than RECLOCK_OK counts as failure. */
2305 	ret = read(state->fd[0], &c, 1);
2306 	if (ret != 1 || c != RECLOCK_OK) {
2307 		DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2308 		state->status = RECLOCK_FAILED;
2313 	state->status = RECLOCK_OK;
/* Verify we still hold the recovery lock by forking a child that pread()s
 * the lock fd (which would block/fail if the cluster filesystem revoked
 * the lock) and reports a one-byte verdict over a pipe.  The parent waits
 * with a 15-second timeout via the event loop.  On failure the lock fd is
 * closed so a re-take is attempted later.
 * NOTE(review): various error-check branches, returns and closing braces
 * between the visible statements are elided in this listing. */
2317 static int check_recovery_lock(struct ctdb_context *ctdb)
2320 	struct ctdb_check_reclock_state *state;
2321 	pid_t parent = getpid();
2323 	if (ctdb->recovery_lock_fd == -1) {
2324 		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2328 	state = talloc(ctdb, struct ctdb_check_reclock_state);
2329 	CTDB_NO_MEMORY(ctdb, state);
2332 	state->start_time = timeval_current();
2333 	state->status = RECLOCK_CHECKING;
2337 	ret = pipe(state->fd);
2340 		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2344 	state->child = fork();
2345 	if (state->child == (pid_t)-1) {
2346 		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2347 		close(state->fd[0]);
2349 		close(state->fd[1]);
/* Child: close the read end, test the lock with pread, report the verdict,
 * then linger only as long as the parent is alive. */
2355 	if (state->child == 0) {
2356 		char cc = RECLOCK_OK;
2357 		close(state->fd[0]);
2360 		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2361 			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2362 			cc = RECLOCK_FAILED;
/* NOTE(review): write() return values are ignored here; in a child about
 * to exit this is benign but would trip -Wunused-result. */
2365 		write(state->fd[1], &cc, 1);
2366 		/* make sure we die when our parent dies */
2367 		while (kill(parent, 0) == 0 || errno != ESRCH) {
2369 			write(state->fd[1], &cc, 1);
/* Parent: close the write end and arm cleanup + timeout + readability
 * events before entering the wait loop. */
2373 	close(state->fd[1]);
2376 	talloc_set_destructor(state, check_reclock_destructor);
2378 	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2379 				    ctdb_check_reclock_timeout, state);
2380 	if (state->te == NULL) {
2381 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2386 	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2387 				EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2388 				reclock_child_handler,
2391 	if (state->fde == NULL) {
2392 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* Pump the event loop until the child replies or the 15s timer fires. */
2397 	while (state->status == RECLOCK_CHECKING) {
2398 		event_loop_once(ctdb->ev);
2401 	if (state->status == RECLOCK_FAILED) {
2402 		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2403 		close(ctdb->recovery_lock_fd);
2404 		ctdb->recovery_lock_fd = -1;
/* Sync the recovery daemon's idea of the reclock file with the main
 * daemon's current setting.  Handles three cases: reclock disabled
 * (NULL), first-time set, and a changed path.  Whenever the path changes
 * or goes away, any held lock fd is closed and verify_recovery_lock is
 * temporarily disabled until the new lock is taken.
 * NOTE(review): return statements and some closing braces between the
 * visible statements are elided in this listing. */
2413 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2415 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2416 	const char *reclockfile;
2418 	if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2419 		DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2420 		talloc_free(tmp_ctx);
/* Case 1: daemon reports no reclock file -- tear down any local state. */
2424 	if (reclockfile == NULL) {
2425 		if (ctdb->recovery_lock_file != NULL) {
2426 			DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2427 			talloc_free(ctdb->recovery_lock_file);
2428 			ctdb->recovery_lock_file = NULL;
2429 			if (ctdb->recovery_lock_fd != -1) {
2430 				close(ctdb->recovery_lock_fd);
2431 				ctdb->recovery_lock_fd = -1;
2434 		ctdb->tunable.verify_recovery_lock = 0;
2435 		talloc_free(tmp_ctx);
/* Case 2: we had no reclock file before -- adopt the reported one. */
2439 	if (ctdb->recovery_lock_file == NULL) {
2440 		ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2441 		if (ctdb->recovery_lock_fd != -1) {
2442 			close(ctdb->recovery_lock_fd);
2443 			ctdb->recovery_lock_fd = -1;
2445 		talloc_free(tmp_ctx);
/* Case 3: unchanged path -- nothing to do. */
2450 	if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2451 		talloc_free(tmp_ctx);
/* Case 4: path changed -- swap in the new path and drop the old lock. */
2455 	talloc_free(ctdb->recovery_lock_file);
2456 	ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2457 	ctdb->tunable.verify_recovery_lock = 0;
2458 	if (ctdb->recovery_lock_fd != -1) {
2459 		close(ctdb->recovery_lock_fd);
2460 		ctdb->recovery_lock_fd = -1;
2463 	talloc_free(tmp_ctx);
/*
 * Main monitoring loop of the recovery daemon.  Registers message-port
 * handlers once, then (per the visible code) periodically: re-reads the
 * tunables, nodemap, vnnmap and recmaster; bans nodes that repeatedly
 * caused recoveries; forces an election when the recmaster is missing,
 * disconnected or inactive; and, when this node IS the recmaster,
 * cross-checks every remote node's nodemap/flags/vnnmap against the
 * local view and calls do_recovery() on any disagreement.  Never returns.
 *
 * NOTE(review): this is a partial listing — loop headers, error branches
 * and closing braces between the numbered lines are not visible here, so
 * comments below describe only what the visible lines establish.
 */
2468 the main monitoring loop
2470 static void monitor_cluster(struct ctdb_context *ctdb)
2473 TALLOC_CTX *mem_ctx=NULL;
2474 struct ctdb_node_map *nodemap=NULL;
2475 struct ctdb_node_map *recmaster_nodemap=NULL;
2476 struct ctdb_node_map **remote_nodemaps=NULL;
2477 struct ctdb_vnn_map *vnnmap=NULL;
2478 struct ctdb_vnn_map *remote_vnnmap=NULL;
2479 int32_t debug_level;
2481 struct ctdb_recoverd *rec;
2483 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* recoverd state lives for the lifetime of the daemon, parented to ctdb */
2485 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2486 CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* presumably used as an election tiebreaker (older daemon wins) — TODO confirm */
2490 rec->priority_time = timeval_current();
2492 /* register a message port for sending memory dumps */
2493 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2495 /* register a message port for recovery elections */
2496 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2498 /* when nodes are disabled/enabled */
2499 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2501 /* when we are asked to push out a flag change */
2502 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2504 /* register a message port for vacuum fetch */
2505 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2507 /* register a message port for reloadnodes */
2508 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2510 /* register a message port for performing a takeover run */
2511 ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
/* per-iteration scratch context: freed and recreated on each pass so
   everything fetched below (vnnmap, remote nodemaps, ...) is released
   automatically.  The enclosing loop construct is not visible here. */
2515 talloc_free(mem_ctx);
2518 mem_ctx = talloc_new(ctdb);
2520 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2524 /* we only check for recovery once every second */
2525 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2527 /* verify that the main daemon is still running */
2528 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2529 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2533 /* ping the local daemon to tell it we are alive */
2534 ctdb_ctrl_recd_ping(ctdb);
2536 if (rec->election_timeout) {
2537 /* an election is in progress */
2541 /* read the debug level from the parent and update locally */
2542 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2544 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2547 LogLevel = debug_level;
2550 /* We must check if we need to ban a node here but we want to do this
2551 as early as possible so we dont wait until we have pulled the node
2552 map from the local node. thats why we have the hardcoded value 20
/* ban any node that has been blamed for 20 or more recent recoveries;
   its counter is reset after the ban is issued */
2554 for (i=0; i<ctdb->num_nodes; i++) {
2555 struct ctdb_banning_state *ban_state;
2557 if (ctdb->nodes[i]->ban_state == NULL) {
2560 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
2561 if (ban_state->count < 20) {
2564 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2565 ctdb->nodes[i]->pnn, ban_state->count,
2566 ctdb->tunable.recovery_ban_period));
2567 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2568 ban_state->count = 0;
2571 /* get relevant tunables */
2572 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2574 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2578 /* get the current recovery lock file from the server */
2579 if (update_recovery_lock_file(ctdb) != 0) {
2580 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2584 /* Make sure that if recovery lock verification becomes disabled when
/* drop any held reclock fd once verification is switched off, so we do
   not keep the lock file pinned */
2587 if (ctdb->tunable.verify_recovery_lock == 0) {
2588 if (ctdb->recovery_lock_fd != -1) {
2589 close(ctdb->recovery_lock_fd);
2590 ctdb->recovery_lock_fd = -1;
2594 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2595 if (pnn == (uint32_t)-1) {
2596 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2600 /* get the vnnmap */
2601 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2603 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2608 /* get number of nodes */
2610 talloc_free(rec->nodemap);
2611 rec->nodemap = NULL;
/* nodemap is parented to rec so it survives across loop iterations */
2614 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2616 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2619 nodemap = rec->nodemap;
2621 /* check which node is the recovery master */
2622 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2624 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2628 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2629 if (rec->recmaster != pnn) {
2630 if (rec->ip_reallocate_ctx != NULL) {
2631 talloc_free(rec->ip_reallocate_ctx);
2632 rec->ip_reallocate_ctx = NULL;
2633 rec->reallocate_callers = NULL;
2636 /* if there are takeovers requested, perform it and notify the waiters */
2637 if (rec->reallocate_callers) {
2638 process_ipreallocate_requests(ctdb, rec);
/* (uint32_t)-1 means no recmaster has ever been elected yet */
2641 if (rec->recmaster == (uint32_t)-1) {
2642 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2643 force_election(rec, pnn, nodemap);
2648 /* if the local daemon is STOPPED, we verify that the databases are
2649 also frozen and that the recmode is set to active
2651 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2652 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2654 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2656 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2657 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2659 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2661 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2664 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2666 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2673 /* If the local node is stopped, verify we are not the recmaster
2674 and yield this role if so
2676 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2677 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2678 force_election(rec, pnn, nodemap);
2682 /* check that we (recovery daemon) and the local ctdb daemon
2683 agree on whether we are banned or not
2687 /* remember our own node flags */
2688 rec->node_flags = nodemap->nodes[pnn].flags;
2690 /* count how many active nodes there are */
2691 rec->num_active = 0;
2692 rec->num_connected = 0;
2693 for (i=0; i<nodemap->num; i++) {
2694 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2697 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2698 rec->num_connected++;
2703 /* verify that the recmaster node is still active */
2704 for (j=0; j<nodemap->num; j++) {
2705 if (nodemap->nodes[j].pnn==rec->recmaster) {
/* after the loop above, j indexes the recmaster's nodemap entry;
   j == nodemap->num means it was not found at all */
2710 if (j == nodemap->num) {
2711 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2712 force_election(rec, pnn, nodemap);
2716 /* if recovery master is disconnected we must elect a new recmaster */
2717 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2718 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2719 force_election(rec, pnn, nodemap);
2723 /* grab the nodemap from the recovery master to check if it is banned */
2724 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2725 mem_ctx, &recmaster_nodemap);
2727 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2728 nodemap->nodes[j].pnn));
/* note: trusts the recmaster's OWN view of its flags, not ours */
2733 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2734 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2735 force_election(rec, pnn, nodemap);
2740 /* verify that we have all ip addresses we should have and we dont
2741 * have addresses we shouldnt have.
2743 if (ctdb->do_checkpublicip) {
2744 if (verify_ip_allocation(ctdb, pnn) != 0) {
2745 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2751 /* if we are not the recmaster then we do not need to check
2752 if recovery is needed
2754 if (pnn != rec->recmaster) {
/* ----- everything below runs only on the recovery master ----- */
2759 /* ensure our local copies of flags are right */
2760 ret = update_local_flags(rec, nodemap);
2761 if (ret == MONITOR_ELECTION_NEEDED) {
2762 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2763 force_election(rec, pnn, nodemap);
2766 if (ret != MONITOR_OK) {
2767 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2771 /* update the list of public ips that a node can handle for
2774 if (ctdb->num_nodes != nodemap->num) {
2775 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2776 reload_nodes_file(ctdb);
2779 for (j=0; j<nodemap->num; j++) {
2780 /* release any existing data */
2781 if (ctdb->nodes[j]->public_ips) {
2782 talloc_free(ctdb->nodes[j]->public_ips);
2783 ctdb->nodes[j]->public_ips = NULL;
2786 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2790 /* grab a new shiny list of public ips from the node */
2791 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2792 ctdb->nodes[j]->pnn,
2794 &ctdb->nodes[j]->public_ips)) {
2795 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2796 ctdb->nodes[j]->pnn));
2802 /* verify that all active nodes agree that we are the recmaster */
2803 switch (verify_recmaster(rec, nodemap, pnn)) {
2804 case MONITOR_RECOVERY_NEEDED:
2805 /* can not happen */
2807 case MONITOR_ELECTION_NEEDED:
2808 force_election(rec, pnn, nodemap);
2812 case MONITOR_FAILED:
2817 if (rec->need_recovery) {
2818 /* a previous recovery didn't finish */
2819 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2823 /* verify that all active nodes are in normal mode
2824 and not in recovery mode
2826 switch (verify_recmode(ctdb, nodemap)) {
2827 case MONITOR_RECOVERY_NEEDED:
2828 ctdb_set_culprit(rec, ctdb->pnn);
2829 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2831 case MONITOR_FAILED:
2833 case MONITOR_ELECTION_NEEDED:
2834 /* can not happen */
2840 if (ctdb->tunable.verify_recovery_lock != 0) {
2841 /* we should have the reclock - check its not stale */
2842 ret = check_recovery_lock(ctdb);
2844 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
2845 ctdb_set_culprit(rec, ctdb->pnn);
2846 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2851 /* get the nodemap for all active remote nodes
2853 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
2854 if (remote_nodemaps == NULL) {
2855 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2858 for(i=0; i<nodemap->num; i++) {
2859 remote_nodemaps[i] = NULL;
2861 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2862 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2866 /* verify that all other nodes have the same nodemap as we have
2868 for (j=0; j<nodemap->num; j++) {
2869 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2873 if (remote_nodemaps[j] == NULL) {
2874 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2875 ctdb_set_culprit(rec, j);
2880 /* if the nodes disagree on how many nodes there are
2881 then this is a good reason to try recovery
2883 if (remote_nodemaps[j]->num != nodemap->num) {
2884 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2885 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2886 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2887 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2891 /* if the nodes disagree on which nodes exist and are
2892 active, then that is also a good reason to do recovery
2894 for (i=0;i<nodemap->num;i++) {
2895 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2896 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2897 nodemap->nodes[j].pnn, i,
2898 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2899 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2900 do_recovery(rec, mem_ctx, pnn, nodemap,
2906 /* verify the flags are consistent
2908 for (i=0; i<nodemap->num; i++) {
2909 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2913 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2914 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2915 nodemap->nodes[j].pnn,
2916 nodemap->nodes[i].pnn,
2917 remote_nodemaps[j]->nodes[i].flags,
2918 nodemap->nodes[j].flags));
/* flag-mismatch resolution: a node's own view of ITS OWN flags wins;
   for third-party nodes the local recmaster's view is pushed out */
2920 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2921 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2922 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2923 do_recovery(rec, mem_ctx, pnn, nodemap,
2927 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2928 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2929 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2930 do_recovery(rec, mem_ctx, pnn, nodemap,
2939 /* there better be the same number of lmasters in the vnn map
2940 as there are active nodes or we will have to do a recovery
2942 if (vnnmap->size != rec->num_active) {
2943 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
2944 vnnmap->size, rec->num_active));
2945 ctdb_set_culprit(rec, ctdb->pnn);
2946 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2950 /* verify that all active nodes in the nodemap also exist in
2953 for (j=0; j<nodemap->num; j++) {
2954 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2957 if (nodemap->nodes[j].pnn == pnn) {
2961 for (i=0; i<vnnmap->size; i++) {
2962 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
/* i == vnnmap->size means the active node is absent from the vnnmap */
2966 if (i == vnnmap->size) {
2967 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2968 nodemap->nodes[j].pnn));
2969 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2970 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2976 /* verify that all other nodes have the same vnnmap
2977 and are from the same generation
2979 for (j=0; j<nodemap->num; j++) {
2980 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2983 if (nodemap->nodes[j].pnn == pnn) {
2987 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2988 mem_ctx, &remote_vnnmap);
2990 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2991 nodemap->nodes[j].pnn));
2995 /* verify the vnnmap generation is the same */
2996 if (vnnmap->generation != remote_vnnmap->generation) {
2997 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2998 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2999 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3000 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3004 /* verify the vnnmap size is the same */
3005 if (vnnmap->size != remote_vnnmap->size) {
3006 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3007 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3008 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3009 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3013 /* verify the vnnmap is the same */
3014 for (i=0;i<vnnmap->size;i++) {
3015 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3016 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3017 nodemap->nodes[j].pnn));
3018 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3019 do_recovery(rec, mem_ctx, pnn, nodemap,
3026 /* we might need to change who has what IP assigned */
3027 if (rec->need_takeover_run) {
/* clear the flag first: any failure path below goes through do_recovery,
   which is responsible for re-running the takeover */
3028 rec->need_takeover_run = false;
3030 /* execute the "startrecovery" event script on all nodes */
3031 ret = run_startrecovery_eventscript(rec, nodemap);
3033 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3034 ctdb_set_culprit(rec, ctdb->pnn);
3035 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3038 ret = ctdb_takeover_run(ctdb, nodemap);
3040 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3041 ctdb_set_culprit(rec, ctdb->pnn);
3042 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3045 /* execute the "recovered" event script on all nodes */
3046 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3048 // we cant check whether the event completed successfully
3049 // since this script WILL fail if the node is in recovery mode
3050 // and if that race happens, the code here would just cause a second
3051 // cascading recovery.
3053 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3054 ctdb_set_culprit(rec, ctdb->pnn);
3055 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
/*
 * fd-event callback fired when the pipe shared with the main ctdbd becomes
 * readable/closed, i.e. the parent daemon has died.  Per the log message it
 * terminates the recovery daemon; the exit call itself is not visible in
 * this excerpt.
 */
3066 event handler for when the main ctdbd dies
3068 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3069 uint16_t flags, void *private_data)
3071 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
/*
 * Watchdog timer callback run in the MAIN daemon: checks (via kill(pid, 0))
 * that the recovery daemon child is still alive.  If it has died, the whole
 * main daemon is shut down: stop keepalives/monitoring, release all public
 * IPs, shut down the transport and run the "shutdown" event script.
 * Re-arms itself every 30 seconds.
 */
3076 called regularly to verify that the recovery daemon is still running
3078 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3079 struct timeval yt, void *p)
3081 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* signal 0 = existence check only, no signal is delivered */
3083 if (kill(ctdb->recoverd_pid, 0) != 0) {
3084 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
3086 ctdb_stop_recoverd(ctdb);
3087 ctdb_stop_keepalive(ctdb);
3088 ctdb_stop_monitoring(ctdb);
3089 ctdb_release_all_ips(ctdb);
3090 if (ctdb->methods != NULL) {
3091 ctdb->methods->shutdown(ctdb);
3093 ctdb_event_script(ctdb, "shutdown");
/* schedule the next check; one-shot timers must be re-added each time */
3098 event_add_timed(ctdb->ev, ctdb,
3099 timeval_current_ofs(30, 0),
3100 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reaps exited children with a
 * non-blocking waitpid(-1, ..., WNOHANG) so no zombies accumulate.
 * NOTE(review): presumably called repeatedly / loops until no child is
 * pending — the loop construct is not visible in this excerpt.
 */
3103 static void recd_sig_child_handler(struct event_context *ev,
3104 struct signal_event *se, int signum, int count,
3108 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3113 pid = waitpid(-1, &status, WNOHANG);
/* ECHILD just means there are no children left to reap — not an error */
3115 if (errno != ECHILD) {
3116 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3121 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
/*
 * Fork the recovery daemon as a child of the main ctdb daemon.
 *
 * Parent path: records the child's pid, arms the 30-second ctdb_check_recd
 * watchdog and returns (return statement not visible in this excerpt).
 *
 * Child path: never returns — it reseeds the RNG, switches from server to
 * client mode, watches the pipe so it can exit when the parent dies,
 * installs a SIGCHLD reaper and enters monitor_cluster().
 *
 * The pipe created here is the parent-death detector: the child holds the
 * read end and gets an fd event when the parent's end closes.
 */
3127 startup the recovery daemon as a child of the main ctdb daemon
3129 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3132 struct signal_event *se;
3134 if (pipe(fd) != 0) {
3138 ctdb->ctdbd_pid = getpid();
3140 ctdb->recoverd_pid = fork();
3141 if (ctdb->recoverd_pid == -1) {
/* parent: child pid is non-zero here */
3145 if (ctdb->recoverd_pid != 0) {
3147 event_add_timed(ctdb->ev, ctdb,
3148 timeval_current_ofs(30, 0),
3149 ctdb_check_recd, ctdb);
/* child from here on: give it its own random sequence, distinct from the parent */
3155 srandom(getpid() ^ time(NULL));
3157 if (switch_from_server_to_client(ctdb) != 0) {
3158 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
/* exit (via the handler) when the parent closes its end of the pipe */
3162 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3163 ctdb_recoverd_parent, &fd[0]);
3165 /* set up a handler to pick up sigchld */
3166 se = event_add_signal(ctdb->ev, ctdb,
3168 recd_sig_child_handler,
3171 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* monitor_cluster() loops forever; reaching the line below is a bug */
3175 monitor_cluster(ctdb);
3177 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3182 shutdown the recovery daemon
3184 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3186 if (ctdb->recoverd_pid == 0) {
3190 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3191 kill(ctdb->recoverd_pid, SIGTERM);