4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
/*
 * Private per-daemon state of the recovery daemon.
 * NOTE(review): this extract has the file's original line numbers baked
 * into every line and interior lines missing (gaps in those numbers);
 * e.g. this struct's closing brace and members referenced later in the
 * file (node_flags, need_recovery) are not visible here.
 */
35 struct ctdb_recoverd *rec;
40 private state of recovery daemon
42 struct ctdb_recoverd {
43 struct ctdb_context *ctdb;
/* number of currently connected nodes */
47 uint32_t num_connected;
48 struct ctdb_node_map *nodemap;
/* culprit tracking: which node keeps triggering recoveries, how often,
   and since when (see ctdb_set_culprit) */
49 uint32_t last_culprit;
50 uint32_t culprit_counter;
51 struct timeval first_recover_time;
/* per-pnn ban state; entry is non-NULL while that node is banned */
52 struct ban_state **banned_nodes;
/* election priority timestamp - reset to "now" on self-ban, which
   presumably lowers our priority (older time wins) - TODO confirm */
53 struct timeval priority_time;
54 bool need_takeover_run;
/* pending election timers and the list of in-flight vacuum batches */
57 struct timed_event *send_election_te;
58 struct timed_event *election_timeout;
59 struct vacuum_info *vacuum_info;
/* Timeout helpers: absolute struct timeval values relative to "now",
   driven by the recover_timeout / recover_interval tunables.  Both
   expect a variable named "ctdb" in the expanding scope. */
62 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
63 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/*
 * Remove the ban on node "pnn".
 * Remote node: forward the request via CTDB_SRVID_UNBAN_NODE.
 * Local node: clear NODE_FLAGS_BANNED (both in rec->node_flags and via
 * a modflags control) and free the recorded ban_state, which presumably
 * also cancels the pending ban-timeout event allocated on it in
 * ctdb_ban_node() - TODO confirm talloc parentage.
 * NOTE(review): "Unanning" in the log text below is a typo for
 * "Unbanning" (runtime string deliberately left unchanged here).
 * NOTE(review): interior lines (returns, closing braces, declarations
 * of "data" and "ret") are missing from this extract.
 */
69 static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
71 struct ctdb_context *ctdb = rec->ctdb;
73 DEBUG(DEBUG_NOTICE,("Unbanning node %u\n", pnn));
75 if (!ctdb_validate_pnn(ctdb, pnn)) {
76 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_unban_node\n", pnn));
80 /* If we are unbanning a different node then just pass the ban info on */
81 if (pnn != ctdb->pnn) {
85 DEBUG(DEBUG_NOTICE,("Unanning remote node %u. Passing the ban request on to the remote node.\n", pnn));
87 data.dptr = (uint8_t *)&pnn;
88 data.dsize = sizeof(uint32_t);
90 ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_UNBAN_NODE, data);
92 DEBUG(DEBUG_ERR,("Failed to unban node %u\n", pnn));
99 /* make sure we remember we are no longer banned in case
100 there is an election */
101 rec->node_flags &= ~NODE_FLAGS_BANNED;
103 DEBUG(DEBUG_INFO,("Clearing ban flag on node %u\n", pnn));
104 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, 0, NODE_FLAGS_BANNED);
106 if (rec->banned_nodes[pnn] == NULL) {
107 DEBUG(DEBUG_INFO,("No ban recorded for this node. ctdb_unban_node() request ignored\n"));
111 talloc_free(rec->banned_nodes[pnn]);
112 rec->banned_nodes[pnn] = NULL;
/*
 * Timed-event callback fired when a ban expires: simply unban the node.
 * "p" is the ban_state set up in ctdb_ban_node().
 */
117 called when a ban has timed out
119 static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
121 struct ban_state *state = talloc_get_type(p, struct ban_state);
122 struct ctdb_recoverd *rec = state->rec;
123 uint32_t pnn = state->banned_node;
125 DEBUG(DEBUG_NOTICE,("Ban timeout. Node %u is now unbanned\n", pnn));
126 ctdb_unban_node(rec, pnn);
/*
 * Ban node "pnn" for "ban_time" seconds.
 * No-op when bans are disabled via the enable_bans tunable.
 * Remote node: forward the request via CTDB_SRVID_BAN_NODE.
 * Local node: set NODE_FLAGS_BANNED, lower our own election priority,
 * record a ban_state and arm a timed event that unbans us when the
 * ban expires (ctdb_ban_timeout).
 * NOTE(review): interior lines are missing from this extract
 * (returns, closing braces, declarations of "data" and "ret",
 * the assignment of b.pnn).
 */
130 ban a node for a period of time
132 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
134 struct ctdb_context *ctdb = rec->ctdb;
136 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
138 if (!ctdb_validate_pnn(ctdb, pnn)) {
139 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
143 if (0 == ctdb->tunable.enable_bans) {
144 DEBUG(DEBUG_INFO,("Bans are disabled - ignoring ban of node %u\n", pnn));
148 /* If we are banning a different node then just pass the ban info on */
149 if (pnn != ctdb->pnn) {
150 struct ctdb_ban_info b;
154 DEBUG(DEBUG_NOTICE,("Banning remote node %u for %u seconds. Passing the ban request on to the remote node.\n", pnn, ban_time));
157 b.ban_time = ban_time;
159 data.dptr = (uint8_t *)&b;
160 data.dsize = sizeof(b);
162 ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_BAN_NODE, data);
164 DEBUG(DEBUG_ERR,("Failed to ban node %u\n", pnn));
171 DEBUG(DEBUG_NOTICE,("self ban - lowering our election priority\n"));
172 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);
174 /* banning ourselves - lower our election priority */
175 rec->priority_time = timeval_current();
177 /* make sure we remember we are banned in case there is an
179 rec->node_flags |= NODE_FLAGS_BANNED;
/* a re-ban replaces any existing ban (and its pending timeout) */
181 if (rec->banned_nodes[pnn] != NULL) {
182 DEBUG(DEBUG_NOTICE,("Re-banning an already banned node. Remove previous ban and set a new ban.\n"));
183 talloc_free(rec->banned_nodes[pnn]);
184 rec->banned_nodes[pnn] = NULL;
187 rec->banned_nodes[pnn] = talloc(rec->banned_nodes, struct ban_state);
188 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);
190 rec->banned_nodes[pnn]->rec = rec;
191 rec->banned_nodes[pnn]->banned_node = pnn;
/* timer is a talloc child of the ban_state, so freeing the ban_state
   presumably cancels it - TODO confirm event_add_timed semantics */
194 event_add_timed(ctdb->ev, rec->banned_nodes[pnn],
195 timeval_current_ofs(ban_time, 0),
196 ctdb_ban_timeout, rec->banned_nodes[pnn]);
/* Outcome of a monitoring pass over the cluster; returned by the
   monitor/verification helpers (e.g. update_local_flags). */
200 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/*
 * Run the "recovered" event on all active nodes by broadcasting the
 * CTDB_CONTROL_END_RECOVERY control.  "caller" is only used to tag the
 * error message.  Returns 0 on success, non-zero on failure (the
 * return statements are among the lines missing from this extract).
 */
204 run the "recovered" eventscript on all nodes
206 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
211 tmp_ctx = talloc_new(ctdb);
212 CTDB_NO_MEMORY(ctdb, tmp_ctx);
214 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
215 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
217 CONTROL_TIMEOUT(), false, tdb_null,
220 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
222 talloc_free(tmp_ctx);
226 talloc_free(tmp_ctx);
/*
 * Record "culprit" as the node responsible for the current recovery.
 * If the culprit changed, or the grace period since the first recorded
 * recovery has elapsed, restart the counter; otherwise keep counting.
 * The counter is used by do_recovery() to ban persistent offenders.
 */
231 remember the trouble maker
233 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
235 struct ctdb_context *ctdb = rec->ctdb;
237 if (rec->last_culprit != culprit ||
238 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
239 DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
240 /* either a new node is the culprit, or we've decided to forgive them */
241 rec->last_culprit = culprit;
242 rec->first_recover_time = timeval_current();
243 rec->culprit_counter = 0;
245 rec->culprit_counter++;
/* Per-node failure callback for the async "startrecovery" broadcast:
   any node that fails the event is marked as the recovery culprit. */
249 /* this callback is called for every node that failed to execute the
252 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
254 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
256 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
258 ctdb_set_culprit(rec, node_pnn);
/*
 * Run the "startrecovery" event on all active nodes via the
 * CTDB_CONTROL_START_RECOVERY control.  Nodes that fail are reported
 * through startrecovery_fail_callback (culprit tracking).  Returns 0
 * on success (return statements missing from this extract).
 */
262 run the "startrecovery" eventscript on all nodes
264 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
268 struct ctdb_context *ctdb = rec->ctdb;
270 tmp_ctx = talloc_new(ctdb);
271 CTDB_NO_MEMORY(ctdb, tmp_ctx);
273 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
274 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
276 CONTROL_TIMEOUT(), false, tdb_null,
278 startrecovery_fail_callback,
280 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
281 talloc_free(tmp_ctx);
285 talloc_free(tmp_ctx);
/* Async callback for GET_CAPABILITIES: validate the payload and store
   the node's capability bits.
   NOTE(review): "lenght" in the log text is a typo for "length"
   (runtime string left unchanged here). */
289 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
291 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
292 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
295 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
/*
 * Refresh ctdb->nodes[]->capabilities from every active node by
 * broadcasting CTDB_CONTROL_GET_CAPABILITIES; replies are handled in
 * async_getcap_callback.  Returns 0 on success (returns missing from
 * this extract).
 */
299 update the node capabilities for all connected nodes
301 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
306 tmp_ctx = talloc_new(ctdb);
307 CTDB_NO_MEMORY(ctdb, tmp_ctx);
309 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
310 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
311 nodes, CONTROL_TIMEOUT(),
313 async_getcap_callback, NULL,
315 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
316 talloc_free(tmp_ctx);
320 talloc_free(tmp_ctx);
/*
 * Set the recovery mode on all active nodes.  When entering
 * CTDB_RECOVERY_ACTIVE the nodes are first frozen (FREEZE control)
 * before SET_RECMODE is broadcast.  Returns 0 on success (returns
 * missing from this extract).
 */
325 change recovery mode on all nodes
327 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
333 tmp_ctx = talloc_new(ctdb);
334 CTDB_NO_MEMORY(ctdb, tmp_ctx);
336 /* freeze all nodes */
337 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
338 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
339 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
340 nodes, CONTROL_TIMEOUT(),
344 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
345 talloc_free(tmp_ctx);
/* broadcast the new mode; rec_mode is passed by pointer, so it must
   stay alive for the duration of the (synchronous) async control */
351 data.dsize = sizeof(uint32_t);
352 data.dptr = (unsigned char *)&rec_mode;
354 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
355 nodes, CONTROL_TIMEOUT(),
359 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
360 talloc_free(tmp_ctx);
364 talloc_free(tmp_ctx);
/*
 * Tell every active node that "pnn" is the recovery master by
 * broadcasting CTDB_CONTROL_SET_RECMASTER.  Returns 0 on success
 * (returns missing from this extract).
 */
369 change recovery master on all node
371 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
377 tmp_ctx = talloc_new(ctdb);
378 CTDB_NO_MEMORY(ctdb, tmp_ctx);
380 data.dsize = sizeof(uint32_t);
381 data.dptr = (unsigned char *)&pnn;
383 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
384 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
386 CONTROL_TIMEOUT(), false, data,
389 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
390 talloc_free(tmp_ctx);
394 talloc_free(tmp_ctx);
/*
 * Make sure every other active node is attached to every database we
 * (node "pnn") have.  For each remote node, fetch its dbmap and create
 * any of our databases it lacks, preserving the persistent flag.
 * Returns 0 on success (returns/braces missing from this extract).
 */
400 ensure all other nodes have attached to any databases that we have
402 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
403 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
406 struct ctdb_dbid_map *remote_dbmap;
408 /* verify that all other nodes have all our databases */
409 for (j=0; j<nodemap->num; j++) {
410 /* no need to check our own node */
411 if (nodemap->nodes[j].pnn == pnn) {
414 /* dont check nodes that are unavailable */
415 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
419 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
420 mem_ctx, &remote_dbmap);
422 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
426 /* step through all local databases */
427 for (db=0; db<dbmap->num;db++) {
/* linear scan: does the remote node have this dbid? */
431 for (i=0;i<remote_dbmap->num;i++) {
432 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
436 /* the remote node already have this database */
437 if (i!=remote_dbmap->num) {
440 /* ok so we need to create this database */
441 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
444 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
447 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
448 mem_ctx, name, dbmap->dbs[db].persistent);
450 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
/*
 * Mirror of create_missing_remote_databases: make sure we are attached
 * to every database any other active node has.  After creating any
 * missing local database, *dbmap is re-read so the caller sees the
 * updated map.  Returns 0 on success (returns/braces missing from
 * this extract).
 */
461 ensure we are attached to any databases that anyone else is attached to
463 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
464 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
467 struct ctdb_dbid_map *remote_dbmap;
469 /* verify that we have all database any other node has */
470 for (j=0; j<nodemap->num; j++) {
471 /* no need to check our own node */
472 if (nodemap->nodes[j].pnn == pnn) {
475 /* dont check nodes that are unavailable */
476 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
480 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
481 mem_ctx, &remote_dbmap);
483 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
487 /* step through all databases on the remote node */
488 for (db=0; db<remote_dbmap->num;db++) {
/* linear scan: do we already have this dbid locally? */
491 for (i=0;i<(*dbmap)->num;i++) {
492 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
496 /* we already have this db locally */
497 if (i!=(*dbmap)->num) {
500 /* ok so we need to create this database and
503 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
504 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
506 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
507 nodemap->nodes[j].pnn));
510 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
511 remote_dbmap->dbs[db].persistent);
513 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh the caller's dbmap now that we attached to a new db */
516 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
518 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
/*
 * Pull all records of database "dbid" from node "srcnode" and merge
 * them into the temporary recovery tdb "recdb".  A pulled record
 * replaces an existing one when its rsn is higher, or when rsn is
 * equal but the stored dmaster is not the recovery master (rsn
 * tie-break during the merge).
 * NOTE(review): "reply" is declared as struct ctdb_control_pulldb_reply
 * but assigned via a cast to struct ctdb_marshall_buffer and then
 * dereferenced (->data) - presumably the two layouts are compatible;
 * TODO confirm against the headers.
 * NOTE(review): interior lines are missing from this extract
 * (the for(...) header, returns, frees of "existing.dptr", braces).
 */
529 pull the remote database contents from one node into the recdb
531 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
532 struct tdb_wrap *recdb, uint32_t dbid)
536 struct ctdb_control_pulldb_reply *reply;
537 struct ctdb_rec_data *rec;
539 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
541 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
542 CONTROL_TIMEOUT(), &outdata);
544 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
545 talloc_free(tmp_ctx);
549 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the marshalled reply before walking it */
551 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
552 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
553 talloc_free(tmp_ctx);
557 rec = (struct ctdb_rec_data *)&reply->data[0];
/* advance to the next marshalled record (records are variable-length,
   packed back to back; rec->length is the full record size) */
561 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
563 struct ctdb_ltdb_header *hdr;
/* key bytes are followed immediately by the data bytes */
566 key.dptr = &rec->data[0];
567 key.dsize = rec->keylen;
568 data.dptr = &rec->data[key.dsize];
569 data.dsize = rec->datalen;
571 hdr = (struct ctdb_ltdb_header *)data.dptr;
573 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
574 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
575 talloc_free(tmp_ctx);
579 /* fetch the existing record, if any */
580 existing = tdb_fetch(recdb->tdb, key);
582 if (existing.dptr != NULL) {
583 struct ctdb_ltdb_header header;
584 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
585 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
586 (unsigned)existing.dsize, srcnode));
588 talloc_free(tmp_ctx);
591 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the stored record unless the incoming one wins the rsn merge */
593 if (!(header.rsn < hdr->rsn ||
594 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
599 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
600 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
601 talloc_free(tmp_ctx);
606 talloc_free(tmp_ctx);
/*
 * Pull database "dbid" from every active node into "recdb", merging by
 * rsn via pull_one_remote_database().  Fails (non-zero) if any single
 * node's pull fails.
 */
612 pull all the remote database contents into the recdb
614 static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
615 struct tdb_wrap *recdb, uint32_t dbid)
619 /* pull all records from all other nodes across onto this node
620 (this merges based on rsn)
622 for (j=0; j<nodemap->num; j++) {
623 /* dont merge from nodes that are unavailable */
624 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
627 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
628 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
629 nodemap->nodes[j].pnn));
/*
 * Broadcast the current flags of every node in "nodemap" to all
 * connected nodes via CTDB_SRVID_NODE_FLAGS_CHANGED.  old_flags ==
 * new_flags here: this re-announces state rather than changing it.
 * Note the ctdb_send_message() return value is not checked.
 */
639 update flags on all active nodes
641 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
644 for (i=0;i<nodemap->num;i++) {
645 struct ctdb_node_flag_change c;
648 c.pnn = nodemap->nodes[i].pnn;
649 c.old_flags = nodemap->nodes[i].flags;
650 c.new_flags = nodemap->nodes[i].flags;
652 data.dptr = (uint8_t *)&c;
653 data.dsize = sizeof(c);
655 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
656 CTDB_SRVID_NODE_FLAGS_CHANGED, data);
/*
 * Broadcast this node's own flags to all connected nodes.
 * NOTE(review): "pnn" is used directly as an index into
 * nodemap->nodes[] - this assumes the pnn equals its slot in the
 * nodemap array; TODO confirm that invariant holds for this cluster
 * layout.  Send result is not checked (fire-and-forget broadcast).
 */
662 static int update_our_flags_on_all_nodes(struct ctdb_context *ctdb, uint32_t pnn, struct ctdb_node_map *nodemap)
664 struct ctdb_node_flag_change c;
667 c.pnn = nodemap->nodes[pnn].pnn;
668 c.old_flags = nodemap->nodes[pnn].flags;
669 c.new_flags = nodemap->nodes[pnn].flags;
671 data.dptr = (uint8_t *)&c;
672 data.dsize = sizeof(c);
674 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
675 CTDB_SRVID_NODE_FLAGS_CHANGED, data);
/*
 * Push our vnnmap to every active node via SETVNNMAP so the whole
 * cluster agrees on the map (and its generation).
 * NOTE(review): the error message logs "pnn" (our node) rather than
 * nodemap->nodes[j].pnn (the node that failed) - possibly misleading.
 */
681 ensure all nodes have the same vnnmap we do
683 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
684 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
688 /* push the new vnn map out to all the nodes */
689 for (j=0; j<nodemap->num; j++) {
690 /* dont push to nodes that are unavailable */
691 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
695 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
697 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/*
 * Message handler for CTDB_SRVID_BAN_NODE: validate the payload, check
 * the ban is addressed to this node, then apply it via ctdb_ban_node().
 * NOTE(review): mem_ctx is freed on the visible exit paths, but the
 * b->pnn-mismatch path's cleanup is among the missing lines - verify
 * it does not leak mem_ctx.
 */
707 handler for when the admin bans a node
709 static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
710 TDB_DATA data, void *private_data)
712 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
713 struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
714 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
716 if (data.dsize != sizeof(*b)) {
717 DEBUG(DEBUG_ERR,("Bad data in ban_handler\n"));
718 talloc_free(mem_ctx);
722 if (b->pnn != ctdb->pnn) {
723 DEBUG(DEBUG_ERR,("Got a ban request for pnn:%u but our pnn is %u. Ignoring ban request\n", b->pnn, ctdb->pnn));
727 DEBUG(DEBUG_NOTICE,("Node %u has been banned for %u seconds\n",
728 b->pnn, b->ban_time));
730 ctdb_ban_node(rec, b->pnn, b->ban_time);
731 talloc_free(mem_ctx);
/*
 * Message handler for CTDB_SRVID_UNBAN_NODE: validate the 4-byte pnn
 * payload, check it addresses this node, then unban via
 * ctdb_unban_node().  Mirror of ban_handler().
 */
735 handler for when the admin unbans a node
737 static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
738 TDB_DATA data, void *private_data)
740 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
741 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
744 if (data.dsize != sizeof(uint32_t)) {
745 DEBUG(DEBUG_ERR,("Bad data in unban_handler\n"));
746 talloc_free(mem_ctx);
749 pnn = *(uint32_t *)data.dptr;
751 if (pnn != ctdb->pnn) {
752 DEBUG(DEBUG_ERR,("Got an unban request for pnn:%u but our pnn is %u. Ignoring unban request\n", pnn, ctdb->pnn));
756 DEBUG(DEBUG_NOTICE,("Node %u has been unbanned.\n", pnn));
757 ctdb_unban_node(rec, pnn);
758 talloc_free(mem_ctx);
/*
 * NOTE(review): the opening "struct vacuum_info {" line is among the
 * lines missing from this extract.  The fields below describe one
 * in-flight vacuum-fetch batch: a doubly linked list node, back
 * pointer to the recovery daemon, the database being worked on, the
 * marshalled record batch, and the cursor "r" into it.
 */
763 struct vacuum_info *next, *prev;
764 struct ctdb_recoverd *rec;
766 struct ctdb_db_context *ctdb_db;
767 struct ctdb_marshall_buffer *recs;
768 struct ctdb_rec_data *r;
/* forward declaration: the fetch loop is re-entered from the callback */
771 static void vacuum_fetch_next(struct vacuum_info *v);
/*
 * Completion callback for one vacuum-fetch ctdb_call: continue with
 * the next record in the batch.
 */
774 called when a vacuum fetch has completed - just free it and do the next one
776 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
778 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
780 vacuum_fetch_next(v);
/*
 * Process the next record from the vacuum batch "v".  For each record:
 * take a non-blocking chainlock (skip the record if contended), skip
 * records that are absent, too small, or already dmastered locally,
 * otherwise issue a CTDB_NULL_FUNC call with CTDB_IMMEDIATE_MIGRATION
 * to migrate the record here, resuming from vacuum_fetch_callback when
 * it completes.
 * NOTE(review): many interior lines are missing from this extract
 * (loop-continue bookkeeping, count decrement, frees of "data.dptr",
 * the terminal cleanup when the batch is exhausted).
 */
785 process the next element from the vacuum list
787 static void vacuum_fetch_next(struct vacuum_info *v)
789 struct ctdb_call call;
790 struct ctdb_rec_data *r;
792 while (v->recs->count) {
793 struct ctdb_client_call_state *state;
795 struct ctdb_ltdb_header *hdr;
798 call.call_id = CTDB_NULL_FUNC;
799 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance the cursor past the current variable-length record */
802 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
805 call.key.dptr = &r->data[0];
806 call.key.dsize = r->keylen;
808 /* ensure we don't block this daemon - just skip a record if we can't get
810 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
814 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
815 if (data.dptr == NULL) {
816 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
820 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
822 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
826 hdr = (struct ctdb_ltdb_header *)data.dptr;
827 if (hdr->dmaster == v->rec->ctdb->pnn) {
828 /* its already local */
830 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* fire the migration call; the chainlock is released before waiting */
836 state = ctdb_call_send(v->ctdb_db, &call);
837 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
839 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
843 state->async.fn = vacuum_fetch_callback;
844 state->async.private_data = v;
/*
 * talloc destructor for a vacuum_info: unlink it from the recovery
 * daemon's vacuum_info list when it is freed.
 */
853 destroy a vacuum info structure
855 static int vacuum_info_destructor(struct vacuum_info *v)
857 DLIST_REMOVE(v->rec->vacuum_info, v);
/*
 * Message handler for a vacuum-fetch request: the payload is a
 * marshalled record batch (ctdb_marshall_buffer).  Steps:
 *  - ignore empty batches and batches already in flight from the same
 *    source node for the same database,
 *  - look up the database's persistent flag from the local dbmap,
 *  - resolve the database name and attach to it,
 *  - copy the batch into a new vacuum_info, link it into the work list
 *    (with a destructor to unlink it) and start processing via
 *    vacuum_fetch_next().
 * NOTE(review): "srcnode" is used but its assignment is among the
 * missing lines; OOM paths visibly free tmp_ctx via lines not shown.
 */
863 handler for vacuum fetch
865 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
866 TDB_DATA data, void *private_data)
868 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
869 struct ctdb_marshall_buffer *recs;
871 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
873 struct ctdb_dbid_map *dbmap=NULL;
874 bool persistent = false;
875 struct ctdb_db_context *ctdb_db;
876 struct ctdb_rec_data *r;
878 struct vacuum_info *v;
880 recs = (struct ctdb_marshall_buffer *)data.dptr;
881 r = (struct ctdb_rec_data *)&recs->data[0];
883 if (recs->count == 0) {
/* dedupe: one in-flight batch per (srcnode, db) pair */
889 for (v=rec->vacuum_info;v;v=v->next) {
890 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
891 /* we're already working on records from this node */
896 /* work out if the database is persistent */
897 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
899 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
900 talloc_free(tmp_ctx);
904 for (i=0;i<dbmap->num;i++) {
905 if (dbmap->dbs[i].dbid == recs->db_id) {
906 persistent = dbmap->dbs[i].persistent;
910 if (i == dbmap->num) {
911 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
912 talloc_free(tmp_ctx);
916 /* find the name of this database */
917 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
918 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
919 talloc_free(tmp_ctx);
/* attach (no-op if already attached) so we can migrate records in */
924 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
925 if (ctdb_db == NULL) {
926 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
927 talloc_free(tmp_ctx);
931 v = talloc_zero(rec, struct vacuum_info);
933 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
938 v->srcnode = srcnode;
939 v->ctdb_db = ctdb_db;
/* deep-copy the batch: the message buffer is owned by the caller */
940 v->recs = talloc_memdup(v, recs, data.dsize);
941 if (v->recs == NULL) {
942 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
946 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
948 DLIST_ADD(rec->vacuum_info, v);
950 talloc_set_destructor(v, vacuum_info_destructor);
952 vacuum_fetch_next(v);
/*
 * Timed-event callback for ctdb_wait_timeout(): flips the caller's
 * timed_out flag (the flag-setting line itself is missing from this
 * extract).
 */
957 called when ctdb_wait_timeout should finish
959 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
960 struct timeval yt, void *p)
962 uint32_t *timed_out = (uint32_t *)p;
/*
 * Block for "secs" seconds while still servicing the event loop:
 * arm a timer that sets timed_out, then loop event_loop_once() until
 * it fires (the surrounding while(), on a missing line, presumably
 * tests !timed_out - TODO confirm).
 */
967 wait for a given number of seconds
969 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
971 uint32_t timed_out = 0;
972 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
974 event_loop_once(ctdb->ev);
/*
 * Timed-event callback fired when the election window closes: clearing
 * election_timeout lets ctdb_wait_election() fall out of its loop.
 */
979 called when an election times out (ends)
981 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
982 struct timeval t, void *p)
984 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
985 rec->election_timeout = NULL;
/*
 * Spin the event loop until the election timeout fires (the timer is
 * presumably re-armed each time an election packet arrives, so this
 * returns election_timeout seconds after the last packet).
 */
990 wait for an election to finish. It finished election_timeout seconds after
991 the last election packet is received
993 static void ctdb_wait_election(struct ctdb_recoverd *rec)
995 struct ctdb_context *ctdb = rec->ctdb;
996 while (rec->election_timeout) {
997 event_loop_once(ctdb->ev);
/*
 * Compare our nodemap flags against every remote active node's view.
 * On mismatch: tell the local daemon (NODE_FLAGS_CHANGED message),
 * adopt the remote flags locally, and if the BANNED bit differs force
 * a new election.  Only run on the (believed) recovery master.
 * Returns monitor_result values (MONITOR_FAILED,
 * MONITOR_ELECTION_NEEDED, and - on missing lines - presumably
 * MONITOR_OK) despite the plain int return type.
 */
1002 Update our local flags from all remote connected nodes.
1003 This is only run when we are or we belive we are the recovery master
1005 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1008 struct ctdb_context *ctdb = rec->ctdb;
1009 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1011 /* get the nodemap for all active remote nodes and verify
1012 they are the same as for this node
1014 for (j=0; j<nodemap->num; j++) {
1015 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
1018 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1021 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1025 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1026 mem_ctx, &remote_nodemap);
1028 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1029 nodemap->nodes[j].pnn));
1030 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1031 talloc_free(mem_ctx);
1032 return MONITOR_FAILED;
/* note: each node is compared against its own slot [j] only */
1034 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1035 struct ctdb_node_flag_change c;
1038 /* We should tell our daemon about this so it
1039 updates its flags or else we will log the same
1040 message again in the next iteration of recovery.
1041 Since we are the recovery master we can just as
1042 well update the flags on all nodes.
1044 c.pnn = nodemap->nodes[j].pnn;
1045 c.old_flags = nodemap->nodes[j].flags;
1046 c.new_flags = remote_nodemap->nodes[j].flags;
1048 data.dptr = (uint8_t *)&c;
1049 data.dsize = sizeof(c);
1051 ctdb_send_message(ctdb, ctdb->pnn,
1052 CTDB_SRVID_NODE_FLAGS_CHANGED,
1055 /* Update our local copy of the flags in the recovery
1058 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1059 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1060 nodemap->nodes[j].flags));
1061 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1063 /* If the BANNED flag has changed for the node
1064 this is a good reason to do a new election.
1066 if ((c.old_flags ^ c.new_flags) & NODE_FLAGS_BANNED) {
1067 DEBUG(DEBUG_NOTICE,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
1068 nodemap->nodes[j].pnn, c.new_flags,
1070 talloc_free(mem_ctx);
1071 return MONITOR_ELECTION_NEEDED;
1075 talloc_free(remote_nodemap);
1077 talloc_free(mem_ctx);
/* Create a new random generation id.
   The generation id can not be the INVALID_GENERATION id; the loop
   (its header is among the missing lines) presumably retries random()
   until a valid value comes up. */
1085 static uint32_t new_generation(void)
1087 uint32_t generation;
1090 generation = random();
1092 if (generation != INVALID_GENERATION) {
/*
 * Create the temporary recovery database "<db_directory>/recdb.tdb"
 * used to merge all remote databases during recovery.  Opened with
 * O_CREAT|O_EXCL, so a leftover recdb.tdb from a previous run must
 * have been removed first (unlink presumably happens on a missing
 * line - TODO confirm).  TDB_NOLOCK is safe: only this daemon touches
 * it; TDB_NOMMAP is added when scheduling tweaks are disabled.
 * Returns the tdb_wrap (talloc'd on mem_ctx) or NULL on failure.
 */
1102 create a temporary working database
1104 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1107 struct tdb_wrap *recdb;
1110 /* open up the temporary recovery database */
1111 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
1117 tdb_flags = TDB_NOLOCK;
1118 if (!ctdb->do_setsched) {
1119 tdb_flags |= TDB_NOMMAP;
1122 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1123 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1124 if (recdb == NULL) {
1125 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
/* Parameter block for traverse_recdb(): accumulates all relevant
   records into a growing marshall buffer.
   NOTE(review): the "struct recdb_data {" opening line and the
   len/failed members are among the lines missing from this extract. */
1135 a traverse function for pulling all relevent records from recdb
1138 struct ctdb_context *ctdb;
1139 struct ctdb_marshall_buffer *recdata;
/*
 * tdb_traverse_read callback: marshal one recdb record into
 * params->recdata, skipping records whose data is only (or less than)
 * a bare ltdb header, and rewriting the dmaster to this node.  Sets
 * params->failed on allocation failure.
 * NOTE(review): hdr->dmaster is written through data.dptr here -
 * mutating the traverse buffer in place; presumably safe for this tdb
 * but worth confirming against tdb_traverse_read semantics.
 * NOTE(review): talloc_realloc_size(NULL, ...) reparents recdata to
 * NULL; push_recdb_database() frees it explicitly on all paths.
 */
1144 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1146 struct recdb_data *params = (struct recdb_data *)p;
1147 struct ctdb_rec_data *rec;
1148 struct ctdb_ltdb_header *hdr;
1150 /* skip empty records */
1151 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1155 /* update the dmaster field to point to us */
1156 hdr = (struct ctdb_ltdb_header *)data.dptr;
1157 hdr->dmaster = params->ctdb->pnn;
1159 /* add the record to the blob ready to send to the nodes */
1160 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1162 params->failed = true;
1165 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1166 if (params->recdata == NULL) {
1167 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1168 rec->length + params->len, params->recdata->count));
1169 params->failed = true;
1172 params->recdata->count++;
/* append the marshalled record at the current end of the blob */
1173 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1174 params->len += rec->length;
/*
 * Marshal the whole recdb (via traverse_recdb) and broadcast it to all
 * active nodes with CTDB_CONTROL_PUSH_DB.  This installs the merged,
 * dmaster-rewritten records cluster-wide.  Returns 0 on success
 * (returns missing from this extract).
 * NOTE(review): the "&para;ms" sequence below is mojibake - it was
 * clearly "&params" before an HTML-entity mangling of this extract.
 */
1181 push the recdb database out to all nodes
1183 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1184 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1186 struct recdb_data params;
1187 struct ctdb_marshall_buffer *recdata;
1189 TALLOC_CTX *tmp_ctx;
1192 tmp_ctx = talloc_new(ctdb);
1193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1195 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1196 CTDB_NO_MEMORY(ctdb, recdata);
1198 recdata->db_id = dbid;
/* start the blob at the marshall-buffer header size */
1201 params.recdata = recdata;
1202 params.len = offsetof(struct ctdb_marshall_buffer, data);
1203 params.failed = false;
1205 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &para;ms) == -1) {
1206 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1207 talloc_free(params.recdata);
1208 talloc_free(tmp_ctx);
1212 if (params.failed) {
1213 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1214 talloc_free(params.recdata);
1215 talloc_free(tmp_ctx);
/* the traverse may have realloc'd the buffer - pick up the new pointer */
1219 recdata = params.recdata;
1221 outdata.dptr = (void *)recdata;
1222 outdata.dsize = params.len;
1224 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1225 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1227 CONTROL_TIMEOUT(), false, outdata,
1230 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1231 talloc_free(recdata);
1232 talloc_free(tmp_ctx);
1236 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1237 dbid, recdata->count));
1239 talloc_free(recdata);
1240 talloc_free(tmp_ctx);
/*
 * Full recovery of a single database:
 *  1. create the temporary recdb,
 *  2. pull and rsn-merge the database from all active nodes,
 *  3. wipe the database cluster-wide (safe inside the recovery
 *     transaction identified by transaction_id),
 *  4. push the merged content back out (sets dmaster, skips empties).
 * Returns 0 on success (returns and the dbid parameter declaration
 * are among the lines missing from this extract).
 */
1247 go through a full recovery on one database
1249 static int recover_database(struct ctdb_recoverd *rec,
1250 TALLOC_CTX *mem_ctx,
1253 struct ctdb_node_map *nodemap,
1254 uint32_t transaction_id)
1256 struct tdb_wrap *recdb;
1258 struct ctdb_context *ctdb = rec->ctdb;
1260 struct ctdb_control_wipe_database w;
1263 recdb = create_recdb(ctdb, mem_ctx);
1264 if (recdb == NULL) {
1268 /* pull all remote databases onto the recdb */
1269 ret = pull_remote_database(ctdb, nodemap, recdb, dbid);
1271 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1275 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1277 /* wipe all the remote databases. This is safe as we are in a transaction */
1279 w.transaction_id = transaction_id;
1281 data.dptr = (void *)&w;
1282 data.dsize = sizeof(w);
1284 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1285 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1287 CONTROL_TIMEOUT(), false, data,
1290 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1295 /* push out the correct database. This sets the dmaster and skips
1296 the empty records */
1297 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1303 /* all done with this database */
/*
  Perform a full cluster recovery while we hold the recmaster role:
  pull/wipe/push all databases inside a cluster-wide transaction, build
  and distribute a new vnnmap, re-assert ourselves as recmaster, re-run
  the IP takeover and fire the start/recovered event scripts.

  NOTE(review): this extract is missing interleaved lines (the left-hand
  original line numbers show gaps) — most error checks, returns and
  closing braces have been elided. Read against the full file.

  rec      - recovery daemon state
  mem_ctx  - talloc context for all temporary allocations in this run
  pnn      - our own node number
  nodemap  - current node map (flags consulted when building the vnnmap)
  vnnmap   - current vnnmap; mutated in place, then replaced by a newly
             allocated map later in the function
  returns 0 on success (the elided error paths presumably return -1 —
  TODO confirm against the full source)
 */
1311 we are the recmaster, and recovery is needed - start a recovery run
1313 static int do_recovery(struct ctdb_recoverd *rec,
1314 TALLOC_CTX *mem_ctx, uint32_t pnn,
1315 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
1318 struct ctdb_context *ctdb = rec->ctdb;
1320 uint32_t generation;
1321 struct ctdb_dbid_map *dbmap;
1325 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1327 /* if recovery fails, force it again */
1328 rec->need_recovery = true;
/* track who caused this recovery; repeat offenders get banned below */
1330 if (culprit != -1) {
1331 ctdb_set_culprit(rec, culprit);
/* ban a node that has triggered more than 2 recoveries per cluster node */
1334 if (rec->culprit_counter > 2*nodemap->num) {
1335 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
1336 culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
1337 ctdb->tunable.recovery_ban_period));
1338 ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
/* we must hold the recovery lock file before touching any database */
1341 if (!ctdb_recovery_lock(ctdb, true)) {
1342 ctdb_set_culprit(rec, pnn);
1343 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1347 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
1349 /* get a list of all databases */
1350 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1352 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1356 /* we do the db creation before we set the recovery mode, so the freeze happens
1357 on all databases we will be dealing with. */
1359 /* verify that we have all the databases any other node has */
1360 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1362 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1366 /* verify that all other nodes have all our databases */
1367 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1369 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1373 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1376 /* set recovery mode to active on all nodes */
1377 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1379 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1383 /* execute the "startrecovery" event script on all nodes */
1384 ret = run_startrecovery_eventscript(rec, nodemap);
1386 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1390 /* pick a new generation number */
1391 generation = new_generation();
1393 /* change the vnnmap on this node to use the new generation
1394 number but not on any other nodes.
1395 this guarantees that if we abort the recovery prematurely
1396 for some reason (a node stops responding?)
1397 that we can just return immediately and we will reenter
1398 recovery shortly again.
1399 I.e. we deliberately leave the cluster with an inconsistent
1400 generation id to allow us to abort recovery at any stage and
1401 just restart it from scratch.
1403 vnnmap->generation = generation;
1404 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1406 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* broadcast TRANSACTION_START with the new generation to all active nodes */
1410 data.dptr = (void *)&generation;
1411 data.dsize = sizeof(uint32_t);
1413 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1414 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1416 CONTROL_TIMEOUT(), false, data,
1419 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1423 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* recover each database in turn inside the open transaction */
1425 for (i=0;i<dbmap->num;i++) {
1426 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1427 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1432 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1434 /* commit all the changes */
1435 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1437 CONTROL_TIMEOUT(), false, data,
1440 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1444 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1447 /* update the capabilities for all nodes */
1448 ret = update_capabilities(ctdb, nodemap);
1450 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
/* rebuild the vnnmap from scratch: fresh generation, only nodes that
   are active and advertise the LMASTER capability are included */
1454 /* build a new vnn map with all the currently active and
1456 generation = new_generation();
1457 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1458 CTDB_NO_MEMORY(ctdb, vnnmap);
1459 vnnmap->generation = generation;
1461 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1462 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1463 for (i=j=0;i<nodemap->num;i++) {
1464 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1467 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1468 /* this node can not be an lmaster */
1469 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
/* grow the map and append this node's pnn */
1474 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1475 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1476 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* degenerate case: no lmaster-capable node — fall back to ourselves so
   the map is never empty */
1479 if (vnnmap->size == 0) {
1480 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1482 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1483 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1484 vnnmap->map[0] = pnn;
1487 /* update to the new vnnmap on all nodes */
1488 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1490 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1494 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1496 /* update recmaster to point to us for all nodes */
1497 ret = set_recovery_master(ctdb, nodemap, pnn);
1499 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1503 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1506 update all nodes to have the same flags that we have
1508 ret = update_flags_on_all_nodes(ctdb, nodemap);
1510 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes\n"));
1514 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1516 /* disable recovery mode */
1517 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
1519 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1523 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
/* redistribute public IP addresses now that the cluster is consistent */
1526 tell nodes to takeover their public IPs
1528 rec->need_takeover_run = false;
1529 ret = ctdb_takeover_run(ctdb, nodemap);
1531 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1534 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1536 /* execute the "recovered" event script on all nodes */
1537 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1539 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1543 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1545 /* send a message to all clients telling them that the cluster
1546 has been reconfigured */
1547 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1549 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1551 rec->need_recovery = false;
1553 /* We just finished a recovery successfully.
1554 We now wait for rerecovery_timeout before we allow
1555 another recovery to take place.
1557 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1558 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1559 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/*
  Payload broadcast on CTDB_SRVID_RECOVERY during a recmaster election.
  Compared field by field in ctdb_election_win(): connectivity first,
  then how long the daemon has been running, then (per the comment
  below) the pnn as a tie-breaker.
  NOTE(review): the extract elides at least one field (ctdb_election_data()
  also fills em->pnn) — see the full file.
 */
1566 elections are won by first checking the number of connected nodes, then
1567 the priority time, then the pnn
1569 struct election_message {
1570 uint32_t num_connected;
1571 struct timeval priority_time;
1573 uint32_t node_flags;
/*
  Fill *em with this node's election credentials: our pnn, how long we
  have been running (priority_time), our node flags, and the number of
  nodes we can currently reach.  If we lack the RECMASTER capability the
  data is deliberately made as weak as possible (0 connected nodes, a
  "just started" priority time) so we cannot win the election.
 */
1577 form this nodes election data
1579 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1582 struct ctdb_node_map *nodemap;
1583 struct ctdb_context *ctdb = rec->ctdb;
1587 em->pnn = rec->ctdb->pnn;
1588 em->priority_time = rec->priority_time;
1589 em->node_flags = rec->node_flags;
/* count every node that is not marked disconnected, ourselves included */
1591 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1593 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1597 for (i=0;i<nodemap->num;i++) {
1598 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1599 em->num_connected++;
1603 /* we shouldnt try to win this election if we cant be a recmaster */
1604 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1605 em->num_connected = 0;
1606 em->priority_time = timeval_current();
1609 talloc_free(nodemap);
/*
  Decide whether WE beat the election data in *em that another node
  broadcast.  Hard vetoes first (no RECMASTER capability, or we are
  banned -> lose; other node banned -> win), then the graded criteria:
  more connected nodes wins, then the longer-running daemon (earlier
  priority_time), then the pnn comparison as final tie-breaker.
  Returns true if we should contest the election.
 */
1613 see if the given election data wins
1615 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1617 struct election_message myem;
1620 ctdb_election_data(rec, &myem);
1622 /* we cant win if we dont have the recmaster capability */
1623 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1627 /* we cant win if we are banned */
1628 if (rec->node_flags & NODE_FLAGS_BANNED) {
1632 /* we will automatically win if the other node is banned */
1633 if (em->node_flags & NODE_FLAGS_BANNED) {
1637 /* try to use the most connected node */
1639 cmp = (int)myem.num_connected - (int)em->num_connected;
1642 /* then the longest running node */
1644 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-breaker: compare pnns */
1648 cmp = (int)myem.pnn - (int)em->pnn;
/*
  Broadcast our election credentials to every node on the
  CTDB_SRVID_RECOVERY port.  If update_recmaster is true we also
  optimistically record ourselves as recmaster on the local daemon
  before the votes come in.
  pnn - our node number (used as both target node and new recmaster
  for the setrecmaster control).
  Returns 0 on success (error paths elided in this extract).
 */
1655 send out an election request
1657 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1660 TDB_DATA election_data;
1661 struct election_message emsg;
1663 struct ctdb_context *ctdb = rec->ctdb;
1665 srvid = CTDB_SRVID_RECOVERY;
1667 ctdb_election_data(rec, &emsg);
/* emsg lives on the stack; the message layer copies it before we return */
1669 election_data.dsize = sizeof(struct election_message);
1670 election_data.dptr = (unsigned char *)&emsg;
1673 /* send an election message to all active nodes */
1674 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1677 /* A new node that is already frozen has entered the cluster.
1678 The existing nodes are not frozen and dont need to be frozen
1679 until the election has ended and we start the actual recovery
1681 if (update_recmaster == true) {
1682 /* first we assume we will win the election and set
1683 recoverymaster to be ourself on the current node
1685 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1687 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
/*
  Clear the BANNED flag on every node we can still reach.  Fetches a
  fresh nodemap into a temporary talloc context and issues a modflags
  control (clearing NODE_FLAGS_BANNED) for each connected, banned node.
  Best-effort: the modflags return value is not checked here.
 */
1697 this function will unban all nodes in the cluster
1699 static void unban_all_nodes(struct ctdb_context *ctdb)
1702 struct ctdb_node_map *nodemap;
1703 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1705 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1707 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1711 for (i=0;i<nodemap->num;i++) {
1712 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1713 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1714 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
/* nodemap is owned by tmp_ctx, so this frees it too */
1718 talloc_free(tmp_ctx);
/*
  Timed-event callback (scheduled from election_handler) that
  re-broadcasts our election request while we believe we are winning.
  Does not update the local recmaster (update_recmaster=false).
  One-shot: frees and clears rec->send_election_te when done.
 */
1725 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1727 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1730 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1732 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1735 talloc_free(rec->send_election_te);
1736 rec->send_election_te = NULL;
/*
  Message handler for CTDB_SRVID_MEM_DUMP.  The payload is a
  rd_memdump_reply giving the pnn/srvid to reply to; we collect our
  talloc memory statistics with ctdb_dump_memory() and send the dump
  back as a message.  All temporaries hang off tmp_ctx, which is freed
  on every exit path visible here.
 */
1740 handler for memory dumps
1742 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1743 TDB_DATA data, void *private_data)
1745 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1748 struct rd_memdump_reply *rd;
/* validate the payload size before casting it to the reply struct */
1750 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1751 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1754 rd = (struct rd_memdump_reply *)data.dptr;
1756 dump = talloc_zero(tmp_ctx, TDB_DATA);
1758 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1759 talloc_free(tmp_ctx);
1762 ret = ctdb_dump_memory(ctdb, dump);
1764 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1765 talloc_free(tmp_ctx);
1769 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* ship the dump back to the requester's private srvid */
1771 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1773 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1777 talloc_free(tmp_ctx);
/*
  Message handler for CTDB_SRVID_RECOVERY — an election packet arrived.
  Resets the election timeout, then either contests the election (if
  ctdb_election_win() says our credentials are better) by scheduling a
  delayed re-broadcast, or concedes: cancel any pending broadcast,
  release the recovery-lock file if we held it, record the sender as
  recmaster on the local daemon, and reset our own ban bookkeeping.
  NOTE(review): no data.dsize validation is visible before the cast of
  data.dptr below — presumably elided by the extract; confirm in the
  full file.
 */
1781 handler for recovery master elections
1783 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1784 TDB_DATA data, void *private_data)
1786 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1788 struct election_message *em = (struct election_message *)data.dptr;
1789 TALLOC_CTX *mem_ctx;
1791 /* we got an election packet - update the timeout for the election */
1792 talloc_free(rec->election_timeout);
1793 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1794 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1795 ctdb_election_timeout, rec);
1797 mem_ctx = talloc_new(ctdb);
1799 /* someone called an election. check their election data
1800 and if we disagree and we would rather be the elected node,
1801 send a new election message to all other nodes
1803 if (ctdb_election_win(rec, em)) {
/* contest: re-broadcast our own request after a 0.5s delay,
   unless one is already queued */
1804 if (!rec->send_election_te) {
1805 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1806 timeval_current_ofs(0, 500000),
1807 election_send_request, rec);
1809 talloc_free(mem_ctx);
1810 /*unban_all_nodes(ctdb);*/
/* we lost: stop any pending re-broadcast */
1815 talloc_free(rec->send_election_te);
1816 rec->send_election_te = NULL;
1818 /* release the recmaster lock */
1819 if (em->pnn != ctdb->pnn &&
1820 ctdb->recovery_lock_fd != -1) {
1821 close(ctdb->recovery_lock_fd);
1822 ctdb->recovery_lock_fd = -1;
1823 unban_all_nodes(ctdb);
1826 /* ok, let that guy become recmaster then */
1827 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1829 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1830 talloc_free(mem_ctx);
1834 /* release any bans */
/* drop all ban state (the talloc_free releases any pending
   ban-expiry children) and start with a clean zeroed array */
1835 rec->last_culprit = (uint32_t)-1;
1836 talloc_free(rec->banned_nodes);
1837 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1838 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1840 talloc_free(mem_ctx);
/*
  Kick off a new recmaster election: freeze the cluster by switching
  every node to recovery mode (stops internode traffic), arm the
  election timeout, broadcast our election request (claiming recmaster
  locally, update_recmaster=true), then block until the election
  window has passed (ctdb_wait_election).
 */
1846 force the start of the election process
1848 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1849 struct ctdb_node_map *nodemap)
1852 struct ctdb_context *ctdb = rec->ctdb;
1854 /* set all nodes to recovery mode to stop all internode traffic */
1855 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1857 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (re)arm the election timeout before sending the first request */
1861 talloc_free(rec->election_timeout);
1862 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1863 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1864 ctdb_election_timeout, rec);
1866 ret = send_election_request(rec, pnn, true);
1868 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1872 /* wait for a few seconds to collect all responses */
1873 ctdb_wait_election(rec);
/*
  Message handler for CTDB_SRVID_NODE_FLAGS_CHANGED.  Applies a remote
  node's flag change to our cached nodemap — except DISCONNECTED, which
  is authoritative locally only — refreshes our view of recmaster and
  recmode, and, when we are the recmaster in normal mode and a
  DISABLED-class flag flipped, schedules an IP takeover run.
 */
1879 handler for when a node changes its flags
1881 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1882 TDB_DATA data, void *private_data)
1885 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1886 struct ctdb_node_map *nodemap=NULL;
1887 TALLOC_CTX *tmp_ctx;
1888 uint32_t changed_flags;
1890 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* reject malformed payloads before using *c */
1892 if (data.dsize != sizeof(*c)) {
1893 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1897 tmp_ctx = talloc_new(ctdb);
1898 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1900 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1902 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1903 talloc_free(tmp_ctx);
/* find the nodemap slot for the node whose flags changed */
1908 for (i=0;i<nodemap->num;i++) {
1909 if (nodemap->nodes[i].pnn == c->pnn) break;
1912 if (i == nodemap->num) {
1913 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1914 talloc_free(tmp_ctx);
1918 changed_flags = c->old_flags ^ c->new_flags;
1920 /* Dont let messages from remote nodes change the DISCONNECTED flag.
1921 This flag is handled locally based on whether the local node
1922 can communicate with the node or not.
1924 c->new_flags &= ~NODE_FLAGS_DISCONNECTED;
1925 if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) {
1926 c->new_flags |= NODE_FLAGS_DISCONNECTED;
1929 if (nodemap->nodes[i].flags != c->new_flags) {
1930 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1933 nodemap->nodes[i].flags = c->new_flags;
/* refresh the cached recmaster/recmode so the checks below use
   current values */
1935 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1936 CTDB_CURRENT_NODE, &ctdb->recovery_master);
1939 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
1940 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
1944 ctdb->recovery_master == ctdb->pnn &&
1945 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
1946 /* Only do the takeover run if the perm disabled or unhealthy
1947 flags changed since these will cause an ip failover but not
1949 If the node became disconnected or banned this will also
1950 lead to an ip address failover but that is handled
1953 if (changed_flags & NODE_FLAGS_DISABLED) {
1954 rec->need_takeover_run = true;
1958 talloc_free(tmp_ctx);
/* shared state for the async recmode poll in verify_recmode();
   status starts as MONITOR_OK and is downgraded by the callback */
1963 struct verify_recmode_normal_data {
1965 enum monitor_result status;
/*
  Async completion callback for one getrecmode control sent by
  verify_recmode().  Downgrades the shared status to MONITOR_FAILED if
  the control itself failed (without overwriting a stronger verdict),
  or to MONITOR_RECOVERY_NEEDED if the node reported it is not in
  normal recovery mode.
 */
1968 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1970 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1973 /* one more node has responded with recmode data*/
1976 /* if we failed to get the recmode, then return an error and let
1977 the main loop try again.
1979 if (state->state != CTDB_CONTROL_DONE) {
1980 if (rmdata->status == MONITOR_OK) {
1981 rmdata->status = MONITOR_FAILED;
1986 /* if we got a response, then the recmode will be stored in the
/* state->status carries the node's recmode value */
1989 if (state->status != CTDB_RECOVERY_NORMAL) {
1990 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
1991 rmdata->status = MONITOR_RECOVERY_NEEDED;
/*
  Fan out an async getrecmode control to every active node, pump the
  event loop until all replies (or timeouts) have come in, and return
  the aggregate verdict: MONITOR_OK, MONITOR_FAILED, or
  MONITOR_RECOVERY_NEEDED (see verify_recmode_normal_callback).
 */
1998 /* verify that all nodes are in normal recovery mode */
1999 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2001 struct verify_recmode_normal_data *rmdata;
2002 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2003 struct ctdb_client_control_state *state;
2004 enum monitor_result status;
2007 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2008 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2010 rmdata->status = MONITOR_OK;
2012 /* loop over all active nodes and send an async getrecmode call to
2014 for (j=0; j<nodemap->num; j++) {
2015 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2018 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2020 nodemap->nodes[j].pnn);
2021 if (state == NULL) {
2022 /* we failed to send the control, treat this as
2023 an error and try again next iteration
2025 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2026 talloc_free(mem_ctx);
2027 return MONITOR_FAILED;
2030 /* set up the callback functions */
2031 state->async.fn = verify_recmode_normal_callback;
2032 state->async.private_data = rmdata;
2034 /* one more control to wait for to complete */
2039 /* now wait for up to the maximum number of seconds allowed
2040 or until all nodes we expect a response from has replied
/* rmdata->count is decremented by the callback as replies arrive */
2042 while (rmdata->count > 0) {
2043 event_loop_once(ctdb->ev);
/* copy the verdict out before mem_ctx (and rmdata) is freed */
2046 status = rmdata->status;
2047 talloc_free(mem_ctx);
/* shared state for the async recmaster poll in verify_recmaster();
   rec is needed so the callback can blame (set_culprit) a dissenting
   node, status aggregates the verdict */
2052 struct verify_recmaster_data {
2053 struct ctdb_recoverd *rec;
2056 enum monitor_result status;
/*
  Async completion callback for one getrecmaster control sent by
  verify_recmaster().  Marks MONITOR_FAILED on a failed control, or
  MONITOR_ELECTION_NEEDED (and records the node as culprit) when a
  node names someone other than us (rmdata->pnn) as recmaster.
 */
2059 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2061 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2064 /* one more node has responded with recmaster data*/
2067 /* if we failed to get the recmaster, then return an error and let
2068 the main loop try again.
2070 if (state->state != CTDB_CONTROL_DONE) {
2071 if (rmdata->status == MONITOR_OK) {
2072 rmdata->status = MONITOR_FAILED;
2077 /* if we got a response, then the recmaster will be stored in the
/* state->status carries the node's idea of the recmaster pnn */
2080 if (state->status != rmdata->pnn) {
2081 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2082 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2083 rmdata->status = MONITOR_ELECTION_NEEDED;
/*
  Fan out an async getrecmaster control to every active node, pump the
  event loop until all replies arrive, and return the aggregate
  verdict: MONITOR_OK, MONITOR_FAILED, or MONITOR_ELECTION_NEEDED
  (see verify_recmaster_callback).  pnn is our own node number — the
  value every node is expected to report.
 */
2090 /* verify that all nodes agree that we are the recmaster */
2091 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2093 struct ctdb_context *ctdb = rec->ctdb;
2094 struct verify_recmaster_data *rmdata;
2095 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2096 struct ctdb_client_control_state *state;
2097 enum monitor_result status;
2100 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2101 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2105 rmdata->status = MONITOR_OK;
2107 /* loop over all active nodes and send an async getrecmaster call to
2109 for (j=0; j<nodemap->num; j++) {
2110 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2113 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2115 nodemap->nodes[j].pnn);
2116 if (state == NULL) {
2117 /* we failed to send the control, treat this as
2118 an error and try again next iteration
2120 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2121 talloc_free(mem_ctx);
2122 return MONITOR_FAILED;
2125 /* set up the callback functions */
2126 state->async.fn = verify_recmaster_callback;
2127 state->async.private_data = rmdata;
2129 /* one more control to wait for to complete */
2134 /* now wait for up to the maximum number of seconds allowed
2135 or until all nodes we expect a response from has replied
/* rmdata->count is decremented by the callback as replies arrive */
2137 while (rmdata->count > 0) {
2138 event_loop_once(ctdb->ev);
/* copy the verdict out before mem_ctx (and rmdata) is freed */
2141 status = rmdata->status;
2142 talloc_free(mem_ctx);
/*
  Record how many nodes we can see by writing a single byte at offset
  ctdb->pnn in the shared reclock ".pnn" file (each node owns the byte
  at its own pnn).  On a write failure the fd is closed and marked
  invalid so the periodic updater will reopen it.
  NOTE(review): num_connected (uint32_t) is truncated to one char here,
  so counts above 255 would wrap — presumably acceptable for realistic
  cluster sizes; confirm against the full source.
 */
2147 this function writes the number of connected nodes we have for this pnn
2148 to the pnn slot in the reclock file
2151 ctdb_recoverd_write_pnn_connect_count(struct ctdb_recoverd *rec)
2153 const char count = rec->num_connected;
2154 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
2156 if (rec->rec_file_fd == -1) {
2157 DEBUG(DEBUG_CRIT,(__location__ " Unable to write pnn count. pnnfile is not open.\n"));
2161 if (pwrite(rec->rec_file_fd, &count, 1, ctdb->pnn) == -1) {
2162 DEBUG(DEBUG_CRIT, (__location__ " Failed to write pnn count\n"));
2163 close(rec->rec_file_fd);
2164 rec->rec_file_fd = -1;
/*
  (Re)open "<recovery_lock_file>.pnn" and take a byte-range write lock
  on our slot, so other recovery daemons can tell we are alive by
  probing the lock.  Any previously held fd is closed first; on
  success the connected-count byte is initialised via
  ctdb_recoverd_write_pnn_connect_count().
  NOTE(review): the header comment below says "position pnn+1" but the
  visible code sets lock.l_start = ctdb->pnn — the offset actually
  locked is pnn; confirm which is intended against the full source.
 */
2169 this function opens the reclock file and sets a byte-range lock for the single
2170 byte at position pnn+1.
2171 the existence/non-existence of such a lock provides an alternative mechanism
2172 to know whether a remote node(recovery daemon) is running or not.
2175 ctdb_recoverd_get_pnn_lock(struct ctdb_recoverd *rec)
2177 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
2179 char *pnnfile = NULL;
2181 DEBUG(DEBUG_INFO, ("Setting PNN lock for pnn:%d\n", ctdb->pnn));
/* drop any stale fd (and its lock) before reopening */
2183 if (rec->rec_file_fd != -1) {
2184 close(rec->rec_file_fd);
2185 rec->rec_file_fd = -1;
2188 pnnfile = talloc_asprintf(rec, "%s.pnn", ctdb->recovery_lock_file);
2189 CTDB_NO_MEMORY_FATAL(ctdb, pnnfile);
2191 rec->rec_file_fd = open(pnnfile, O_RDWR|O_CREAT, 0600);
2192 if (rec->rec_file_fd == -1) {
2193 DEBUG(DEBUG_CRIT,(__location__ " Unable to open %s - (%s)\n",
2194 pnnfile, strerror(errno)));
2195 talloc_free(pnnfile);
/* don't leak the lock fd into event-script children */
2199 set_close_on_exec(rec->rec_file_fd);
2200 lock.l_type = F_WRLCK;
2201 lock.l_whence = SEEK_SET;
2202 lock.l_start = ctdb->pnn;
/* non-blocking lock attempt: failure means another daemon instance
   for this pnn already holds it */
2206 if (fcntl(rec->rec_file_fd, F_SETLK, &lock) != 0) {
2207 close(rec->rec_file_fd);
2208 rec->rec_file_fd = -1;
2209 DEBUG(DEBUG_CRIT,(__location__ " Failed to get pnn lock on '%s'\n", pnnfile));
2210 talloc_free(pnnfile);
2215 DEBUG(DEBUG_NOTICE,(__location__ " Got pnn lock on '%s'\n", pnnfile));
2216 talloc_free(pnnfile);
2218 /* we start out with 0 connected nodes */
2219 ctdb_recoverd_write_pnn_connect_count(rec);
/*
  Periodic timed event (re-armed every reclock_ping_period seconds):
  refresh our pnn lock and connected-count byte in the reclock file,
  then run the split-brain check — if we are recmaster but another,
  disconnected node's slot shows it sees more nodes than we do (or we
  lost the reclock file), yield the role and force a new election.
 */
2223 called when we need to do the periodical reclock pnn count update
2225 static void ctdb_update_pnn_count(struct event_context *ev, struct timed_event *te,
2226 struct timeval t, void *p)
2229 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2230 struct ctdb_context *ctdb = rec->ctdb;
2231 struct ctdb_node_map *nodemap = rec->nodemap;
2233 /* close and reopen the pnn lock file */
2234 ctdb_recoverd_get_pnn_lock(rec);
2236 ctdb_recoverd_write_pnn_connect_count(rec);
/* re-arm ourselves for the next period */
2238 event_add_timed(rec->ctdb->ev, rec->ctdb,
2239 timeval_current_ofs(ctdb->tunable.reclock_ping_period, 0),
2240 ctdb_update_pnn_count, rec);
2242 /* check if there is a split cluster and yield the recmaster role
2243 if the other half of the cluster is larger
2245 DEBUG(DEBUG_DEBUG, ("CHECK FOR SPLIT CLUSTER\n"));
2246 if (rec->nodemap == NULL) {
2249 if (rec->rec_file_fd == -1) {
2252 /* only test this if we think we are the recmaster */
2253 if (ctdb->pnn != rec->recmaster) {
2254 DEBUG(DEBUG_DEBUG, ("We are not recmaster, skip test\n"));
/* if the recovery lock fd is gone we can no longer prove we hold the
   recmaster lock — give up the role */
2257 if (ctdb->recovery_lock_fd == -1) {
2258 DEBUG(DEBUG_ERR, (__location__ " Lost reclock pnn file. Yielding recmaster role\n"));
2259 close(ctdb->recovery_lock_fd);
2260 ctdb->recovery_lock_fd = -1;
2261 force_election(rec, ctdb->pnn, rec->nodemap);
2264 for (i=0; i<nodemap->num; i++) {
2265 /* we dont need to check ourself */
2266 if (nodemap->nodes[i].pnn == ctdb->pnn) {
2269 /* dont check nodes that are connected to us */
2270 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2273 /* check if the node is "connected" and how connected it is */
2274 count = ctdb_read_pnn_lock(rec->rec_file_fd, nodemap->nodes[i].pnn);
2278 /* check if that node is more connected than us */
2279 if (count > rec->num_connected) {
2280 DEBUG(DEBUG_ERR, ("DISCONNECTED Node %u is more connected than we are, yielding recmaster role\n", nodemap->nodes[i].pnn));
2281 close(ctdb->recovery_lock_fd);
2282 ctdb->recovery_lock_fd = -1;
2283 force_election(rec, ctdb->pnn, rec->nodemap);
/*
  Sanity-check the local public IP assignment against what the daemon
  thinks we should hold.  Uptime is sampled before and after reading
  the IP list; if the recovery timestamps changed in between (or a
  recovery is in flight) the check is skipped as racy.  On any
  mismatch — an IP we should serve but don't, or one we serve but
  shouldn't — the local node is frozen and recmode set to ACTIVE so
  the recmaster performs a full recovery.
 */
2289 /* called to check that the allocation of public ip addresses is ok.
2291 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2293 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2294 struct ctdb_all_public_ips *ips = NULL;
2295 struct ctdb_uptime *uptime1 = NULL;
2296 struct ctdb_uptime *uptime2 = NULL;
/* first uptime sample — establishes the pre-read recovery timestamps */
2299 ret = ctdb_ctrl_uptime(ctdb, ctdb, CONTROL_TIMEOUT(),
2300 CTDB_CURRENT_NODE, &uptime1);
2302 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2303 talloc_free(mem_ctx);
2307 /* read the ip allocation from the local node */
2308 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2310 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2311 talloc_free(mem_ctx);
/* second uptime sample — compared against the first below */
2315 ret = ctdb_ctrl_uptime(ctdb, ctdb, CONTROL_TIMEOUT(),
2316 CTDB_CURRENT_NODE, &uptime2);
2318 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2319 talloc_free(mem_ctx);
2323 /* skip the check if the startrecovery time has changed */
2324 if (timeval_compare(&uptime1->last_recovery_started,
2325 &uptime2->last_recovery_started) != 0) {
2326 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2330 /* skip the check if the endrecovery time has changed */
2331 if (timeval_compare(&uptime1->last_recovery_finished,
2332 &uptime2->last_recovery_finished) != 0) {
2333 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2337 /* skip the check if we have started but not finished recovery */
2338 if (timeval_compare(&uptime1->last_recovery_finished,
2339 &uptime1->last_recovery_started) != 1) {
2340 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
2345 /* verify that we have the ip addresses we should have
2346 and we dont have ones we shouldnt have.
2347 if we find an inconsistency we set recmode to
2348 active on the local node and wait for the recmaster
2349 to do a full blown recovery
2351 for (j=0; j<ips->num; j++) {
2352 if (ips->ips[j].pnn == pnn) {
/* assigned to us but not present on an interface — escalate */
2353 if (!ctdb_sys_have_ip(ips->ips[j].sin)) {
2354 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
2355 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2357 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2359 talloc_free(mem_ctx);
2362 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2364 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2366 talloc_free(mem_ctx);
/* assigned to another node but still present here — escalate */
2371 if (ctdb_sys_have_ip(ips->ips[j].sin)) {
2372 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", inet_ntoa(ips->ips[j].sin.sin_addr)));
2373 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2375 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2377 talloc_free(mem_ctx);
2380 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2382 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2384 talloc_free(mem_ctx);
2391 talloc_free(mem_ctx);
2396 the main monitoring loop
2398 static void monitor_cluster(struct ctdb_context *ctdb)
2401 TALLOC_CTX *mem_ctx=NULL;
2402 struct ctdb_node_map *nodemap=NULL;
2403 struct ctdb_node_map *remote_nodemap=NULL;
2404 struct ctdb_vnn_map *vnnmap=NULL;
2405 struct ctdb_vnn_map *remote_vnnmap=NULL;
2406 int32_t debug_level;
2408 struct ctdb_recoverd *rec;
2411 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2413 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2414 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2417 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
2418 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
2420 rec->priority_time = timeval_current();
2422 /* open the rec file fd and lock our slot */
2423 rec->rec_file_fd = -1;
2424 ctdb_recoverd_get_pnn_lock(rec);
2426 /* register a message port for sending memory dumps */
2427 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2429 /* register a message port for recovery elections */
2430 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2432 /* and one for when nodes are disabled/enabled */
2433 ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec);
2435 /* and one for when nodes are banned */
2436 ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
2438 /* and one for when nodes are unbanned */
2439 ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
2441 /* register a message port for vacuum fetch */
2442 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2444 /* update the reclock pnn file connected count on a regular basis */
2445 event_add_timed(ctdb->ev, ctdb,
2446 timeval_current_ofs(ctdb->tunable.reclock_ping_period, 0),
2447 ctdb_update_pnn_count, rec);
2451 talloc_free(mem_ctx);
2454 mem_ctx = talloc_new(ctdb);
2456 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2460 /* we only check for recovery once every second */
2461 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2463 /* verify that the main daemon is still running */
2464 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2465 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2469 if (rec->election_timeout) {
2470 /* an election is in progress */
2474 /* read the debug level from the parent and update locally */
2475 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2477 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2480 LogLevel = debug_level;
2483 /* We must check if we need to ban a node here but we want to do this
2484 as early as possible so we dont wait until we have pulled the node
2485 map from the local node. thats why we have the hardcoded value 20
2487 if (rec->culprit_counter > 20) {
2488 DEBUG(DEBUG_NOTICE,("Node %u has caused %u failures in %.0f seconds - banning it for %u seconds\n",
2489 rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
2490 ctdb->tunable.recovery_ban_period));
2491 ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
2494 /* get relevant tunables */
2495 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2497 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2501 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2502 if (pnn == (uint32_t)-1) {
2503 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2507 /* get the vnnmap */
2508 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2510 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2515 /* get number of nodes */
2517 talloc_free(rec->nodemap);
2518 rec->nodemap = NULL;
2521 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2523 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2526 nodemap = rec->nodemap;
2528 /* check which node is the recovery master */
2529 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2531 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2535 if (rec->recmaster == (uint32_t)-1) {
2536 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2537 force_election(rec, pnn, nodemap);
2541 /* check that we (recovery daemon) and the local ctdb daemon
2542 agrees on whether we are banned or not
2544 if (nodemap->nodes[pnn].flags & NODE_FLAGS_BANNED) {
2545 if (rec->banned_nodes[pnn] == NULL) {
2546 if (rec->recmaster == pnn) {
2547 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on recmaster thinks this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
2549 ctdb_unban_node(rec, pnn);
2551 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on non-recmaster thinks this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
2552 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2553 ctdb_set_culprit(rec, pnn);
2558 if (rec->banned_nodes[pnn] != NULL) {
2559 if (rec->recmaster == pnn) {
2560 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on recmaster does not think this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
2562 ctdb_unban_node(rec, pnn);
2564 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on non-recmaster does not think this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
2566 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2567 ctdb_set_culprit(rec, pnn);
2573 /* remember our own node flags */
2574 rec->node_flags = nodemap->nodes[pnn].flags;
2576 /* count how many active nodes there are */
2577 rec->num_active = 0;
2578 rec->num_connected = 0;
2579 for (i=0; i<nodemap->num; i++) {
2580 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2583 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2584 rec->num_connected++;
2589 /* verify that the recmaster node is still active */
2590 for (j=0; j<nodemap->num; j++) {
2591 if (nodemap->nodes[j].pnn==rec->recmaster) {
2596 if (j == nodemap->num) {
2597 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2598 force_election(rec, pnn, nodemap);
2602 /* if recovery master is disconnected we must elect a new recmaster */
2603 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2604 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2605 force_election(rec, pnn, nodemap);
2609 /* grap the nodemap from the recovery master to check if it is banned */
2610 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2611 mem_ctx, &remote_nodemap);
2613 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2614 nodemap->nodes[j].pnn));
2619 if (remote_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2620 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2621 force_election(rec, pnn, nodemap);
2626 /* verify that we and the recmaster agrees on our flags */
2627 if (nodemap->nodes[pnn].flags != remote_nodemap->nodes[pnn].flags) {
2628 DEBUG(DEBUG_ERR, (__location__ " Recmaster disagrees on our flags flags:0x%x recmaster_flags:0x%x Broadcasting out flags.\n", nodemap->nodes[pnn].flags, remote_nodemap->nodes[pnn].flags));
2630 update_our_flags_on_all_nodes(ctdb, pnn, nodemap);
2634 /* verify that we have all ip addresses we should have and we dont
2635 * have addresses we shouldnt have.
2637 if (verify_ip_allocation(ctdb, pnn) != 0) {
2638 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2643 /* if we are not the recmaster then we do not need to check
2644 if recovery is needed
2646 if (pnn != rec->recmaster) {
2651 /* ensure our local copies of flags are right */
2652 ret = update_local_flags(rec, nodemap);
2653 if (ret == MONITOR_ELECTION_NEEDED) {
2654 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2655 force_election(rec, pnn, nodemap);
2658 if (ret != MONITOR_OK) {
2659 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2663 /* update the list of public ips that a node can handle for
2666 for (j=0; j<nodemap->num; j++) {
2667 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2670 /* release any existing data */
2671 if (ctdb->nodes[j]->public_ips) {
2672 talloc_free(ctdb->nodes[j]->public_ips);
2673 ctdb->nodes[j]->public_ips = NULL;
2675 /* grab a new shiny list of public ips from the node */
2676 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2677 ctdb->nodes[j]->pnn,
2679 &ctdb->nodes[j]->public_ips)) {
2680 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2681 ctdb->nodes[j]->pnn));
2687 /* verify that all active nodes agree that we are the recmaster */
2688 switch (verify_recmaster(rec, nodemap, pnn)) {
2689 case MONITOR_RECOVERY_NEEDED:
2690 /* can not happen */
2692 case MONITOR_ELECTION_NEEDED:
2693 force_election(rec, pnn, nodemap);
2697 case MONITOR_FAILED:
2702 if (rec->need_recovery) {
2703 /* a previous recovery didn't finish */
2704 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, -1);
2708 /* verify that all active nodes are in normal mode
2709 and not in recovery mode
2711 switch (verify_recmode(ctdb, nodemap)) {
2712 case MONITOR_RECOVERY_NEEDED:
2713 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
2715 case MONITOR_FAILED:
2717 case MONITOR_ELECTION_NEEDED:
2718 /* can not happen */
2724 /* we should have the reclock - check its not stale */
2725 if (ctdb->recovery_lock_fd == -1) {
2726 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2727 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
2731 if (pread(ctdb->recovery_lock_fd, &c, 1, 0) == -1) {
2732 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2733 close(ctdb->recovery_lock_fd);
2734 ctdb->recovery_lock_fd = -1;
2735 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
2739 /* get the nodemap for all active remote nodes and verify
2740 they are the same as for this node
2742 for (j=0; j<nodemap->num; j++) {
2743 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2746 if (nodemap->nodes[j].pnn == pnn) {
2750 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2751 mem_ctx, &remote_nodemap);
2753 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
2754 nodemap->nodes[j].pnn));
2758 /* if the nodes disagree on how many nodes there are
2759 then this is a good reason to try recovery
2761 if (remote_nodemap->num != nodemap->num) {
2762 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2763 nodemap->nodes[j].pnn, remote_nodemap->num, nodemap->num));
2764 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
2768 /* if the nodes disagree on which nodes exist and are
2769 active, then that is also a good reason to do recovery
2771 for (i=0;i<nodemap->num;i++) {
2772 if (remote_nodemap->nodes[i].pnn != nodemap->nodes[i].pnn) {
2773 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2774 nodemap->nodes[j].pnn, i,
2775 remote_nodemap->nodes[i].pnn, nodemap->nodes[i].pnn));
2776 do_recovery(rec, mem_ctx, pnn, nodemap,
2777 vnnmap, nodemap->nodes[j].pnn);
2780 if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) !=
2781 (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2782 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n",
2783 nodemap->nodes[j].pnn, i,
2784 remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags));
2785 do_recovery(rec, mem_ctx, pnn, nodemap,
2786 vnnmap, nodemap->nodes[j].pnn);
2794 /* there better be the same number of lmasters in the vnn map
2795 as there are active nodes or we will have to do a recovery
2797 if (vnnmap->size != rec->num_active) {
2798 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
2799 vnnmap->size, rec->num_active));
2800 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
2804 /* verify that all active nodes in the nodemap also exist in
2807 for (j=0; j<nodemap->num; j++) {
2808 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2811 if (nodemap->nodes[j].pnn == pnn) {
2815 for (i=0; i<vnnmap->size; i++) {
2816 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2820 if (i == vnnmap->size) {
2821 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2822 nodemap->nodes[j].pnn));
2823 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
2829 /* verify that all other nodes have the same vnnmap
2830 and are from the same generation
2832 for (j=0; j<nodemap->num; j++) {
2833 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2836 if (nodemap->nodes[j].pnn == pnn) {
2840 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2841 mem_ctx, &remote_vnnmap);
2843 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2844 nodemap->nodes[j].pnn));
2848 /* verify the vnnmap generation is the same */
2849 if (vnnmap->generation != remote_vnnmap->generation) {
2850 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2851 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2852 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
2856 /* verify the vnnmap size is the same */
2857 if (vnnmap->size != remote_vnnmap->size) {
2858 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2859 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2860 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
2864 /* verify the vnnmap is the same */
2865 for (i=0;i<vnnmap->size;i++) {
2866 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2867 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2868 nodemap->nodes[j].pnn));
2869 do_recovery(rec, mem_ctx, pnn, nodemap,
2870 vnnmap, nodemap->nodes[j].pnn);
2876 /* we might need to change who has what IP assigned */
2877 if (rec->need_takeover_run) {
2878 rec->need_takeover_run = false;
2880 /* execute the "startrecovery" event script on all nodes */
2881 ret = run_startrecovery_eventscript(rec, nodemap);
2883 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
2884 do_recovery(rec, mem_ctx, pnn, nodemap,
2888 ret = ctdb_takeover_run(ctdb, nodemap);
2890 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
2891 do_recovery(rec, mem_ctx, pnn, nodemap,
2895 /* execute the "recovered" event script on all nodes */
2896 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
2898 // we cant check whether the event completed successfully
2899 // since this script WILL fail if the node is in recovery mode
2900 // and if that race happens, the code here would just cause a second
2901 // cascading recovery.
2903 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
2904 do_recovery(rec, mem_ctx, pnn, nodemap,
2911 DEBUG(DEBUG_DEBUG, (__location__ " Update flags on all nodes\n"));
2913 update all nodes to have the same flags that we have
2915 ret = update_flags_on_all_nodes(ctdb, nodemap);
2917 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes\n"));
2926 event handler for when the main ctdbd dies
/*
  Fd-event callback attached (by ctdb_start_recoverd) to the read end of the
  pipe shared with the main ctdbd process.  It fires when the parent closes
  its end of the pipe, i.e. when the main daemon has died.
  NOTE(review): the function's braces and its terminating statement are on
  lines elided from this extract — presumably it exits the recovery daemon
  immediately after logging; confirm against the full source.
 */
2928 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
2929 	uint16_t flags, void *private_data)
/* losing the parent daemon is fatal, hence the ALERT log level */
2931 	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
2936 called regularly to verify that the recovery daemon is still running
/*
  Timed-event callback run in the MAIN ctdb daemon: verify the recovery
  daemon child is still alive, and if it has died, shut the whole main
  daemon down cleanly.  Re-arms itself every 30 seconds.
  NOTE(review): closing braces and the final exit statement of the
  failure branch are on lines elided from this extract.
 */
2938 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
2939 struct timeval yt, void *p)
2941 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* kill(pid, 0) sends no signal; it only probes whether the pid exists */
2943 if (kill(ctdb->recoverd_pid, 0) != 0) {
2944 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
/* orderly teardown: stop child/maintenance services, release public IPs,
   shut the transport down, then run the "shutdown" event script */
2946 ctdb_stop_recoverd(ctdb);
2947 ctdb_stop_keepalive(ctdb);
2948 ctdb_stop_monitoring(ctdb);
2949 ctdb_release_all_ips(ctdb);
2950 if (ctdb->methods != NULL) {
2951 ctdb->methods->shutdown(ctdb);
2953 ctdb_event_script(ctdb, "shutdown");
/* schedule the next liveness check in 30 seconds */
2958 event_add_timed(ctdb->ev, ctdb,
2959 timeval_current_ofs(30, 0),
2960 ctdb_check_recd, ctdb);
/*
  SIGCHLD handler for the recovery daemon: reap any exited child
  processes so they do not remain as zombies.
  NOTE(review): the surrounding loop header, variable declarations and
  braces are on lines elided from this extract — presumably waitpid() is
  called in a loop with WNOHANG until no more children are pending.
 */
2963 static void recd_sig_child_handler(struct event_context *ev,
2964 struct signal_event *se, int signum, int count,
/* the ctdb context is not needed here; kept as a commented-out reminder */
2968 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
/* non-blocking reap: -1 means any child, WNOHANG returns 0 when none left */
2973 pid = waitpid(-1, &status, WNOHANG);
2975 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%d\n", errno));
2979 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
2985 startup the recovery daemon as a child of the main ctdb daemon
/*
  Fork the recovery daemon as a child of the main ctdb daemon.
  The parent arms a periodic ctdb_check_recd() timer to watch the child;
  the child tears down inherited state (transport, event context, daemon
  socket), reconnects to ctdbd over the unix socket, installs a SIGCHLD
  handler, and enters monitor_cluster() — which is not expected to return.
  NOTE(review): several lines (error-path returns, closing braces and some
  argument lines) are elided from this extract; comments below describe
  only what the visible lines establish.
 */
2987 int ctdb_start_recoverd(struct ctdb_context *ctdb)
2991 struct signal_event *se;
/* pipe used by the child to detect the parent's death: the child holds
   the read end and gets an fd event when the parent exits */
2993 if (pipe(fd) != 0) {
2997 ctdb->ctdbd_pid = getpid();
2999 ctdb->recoverd_pid = fork();
3000 if (ctdb->recoverd_pid == -1) {
/* parent branch: start the periodic recovery-daemon liveness check */
3004 if (ctdb->recoverd_pid != 0) {
3006 event_add_timed(ctdb->ev, ctdb,
3007 timeval_current_ofs(30, 0),
3008 ctdb_check_recd, ctdb);
/* ---- child (recovery daemon) from here on ---- */
/* shutdown the inherited transport - the child must not share it with the parent */
3014 /* shutdown the transport */
3015 if (ctdb->methods) {
3016 ctdb->methods->shutdown(ctdb);
/* get a new event context - the inherited one belongs to the parent */
3019 /* get a new event context */
3020 talloc_free(ctdb->ev);
3021 ctdb->ev = event_context_init(ctdb);
/* watch the pipe read end so the child exits when the parent dies;
   AUTOCLOSE lets the event layer own the fd's lifetime */
3023 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3024 ctdb_recoverd_parent, &fd[0]);
/* drop the inherited daemon listening socket - it belongs to the parent */
3026 close(ctdb->daemon.sd);
3027 ctdb->daemon.sd = -1;
/* seed the PRNG differently in parent and child (pid differs) */
3029 srandom(getpid() ^ time(NULL));
/* give up any elevated (realtime) scheduling the parent may run with */
3031 /* the recovery daemon does not need to be realtime */
3032 if (ctdb->do_setsched) {
3033 ctdb_restore_scheduler(ctdb);
/* reconnect to the main daemon as an ordinary client */
3036 /* initialise ctdb */
3037 ret = ctdb_socket_connect(ctdb);
3039 DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb\n"));
/* reap exited children so the recovery daemon does not accumulate zombies */
3043 /* set up a handler to pick up sigchld */
3044 se = event_add_signal(ctdb->ev, ctdb,
3046 recd_sig_child_handler,
3049 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* main monitoring loop - not expected to return */
3053 monitor_cluster(ctdb);
3055 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3060 shutdown the recovery daemon
/*
  Ask the recovery daemon child to terminate by sending it SIGTERM.
  No-op when no recovery daemon was started (recoverd_pid == 0).
  NOTE(review): the early-return body and the closing brace are on lines
  elided from this extract.
 */
3062 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
/* pid 0 means fork() never happened (or we ARE the child) - nothing to stop */
3064 if (ctdb->recoverd_pid == 0) {
3068 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3069 kill(ctdb->recoverd_pid, SIGTERM);