4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
/* NOTE(review): this dump is lossy — blank lines, closing braces and some
   members from the original file are missing; code lines kept byte-identical. */
/* global handle to the recovery daemon state */
35 struct ctdb_recoverd *rec;
39 /* list of "ctdb ipreallocate" processes to call back when we have
40 finished the takeover run.
42 struct ip_reallocate_list {
43 struct ip_reallocate_list *next;
44 struct rd_memdump_reply *rd;
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
55 struct ctdb_node_map *nodemap;
/* culprit tracking: which node keeps causing recoveries, and since when */
56 uint32_t last_culprit;
57 uint32_t culprit_counter;
58 struct timeval first_recover_time;
/* per-pnn ban state; entry is non-NULL while that node is banned */
59 struct ban_state **banned_nodes;
/* used as election priority — reset when we ban ourselves (see ctdb_ban_node) */
60 struct timeval priority_time;
61 bool need_takeover_run;
62 struct timed_event *send_election_te;
65 struct timed_event *election_timeout;
66 struct vacuum_info *vacuum_info;
/* callers waiting on "ctdb ipreallocate" to be answered after a takeover run */
67 TALLOC_CTX *ip_reallocate_ctx;
68 struct ip_reallocate_list *reallocate_callers;
/* timeouts for controls sent by the recovery daemon; both expand ctdb from scope */
71 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
72 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
78 static void ctdb_unban_node(struct ctdb_recoverd *rec, uint32_t pnn)
80 struct ctdb_context *ctdb = rec->ctdb;
82 DEBUG(DEBUG_NOTICE,("Unbanning node %u\n", pnn));
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_unban_node\n", pnn));
89 /* If we are unbanning a different node then just pass the ban info on */
90 if (pnn != ctdb->pnn) {
94 DEBUG(DEBUG_NOTICE,("Unanning remote node %u. Passing the ban request on to the remote node.\n", pnn));
96 data.dptr = (uint8_t *)&pnn;
97 data.dsize = sizeof(uint32_t);
99 ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_UNBAN_NODE, data);
101 DEBUG(DEBUG_ERR,("Failed to unban node %u\n", pnn));
108 /* make sure we remember we are no longer banned in case
109 there is an election */
110 rec->node_flags &= ~NODE_FLAGS_BANNED;
112 DEBUG(DEBUG_INFO,("Clearing ban flag on node %u\n", pnn));
113 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, 0, NODE_FLAGS_BANNED);
115 if (rec->banned_nodes[pnn] == NULL) {
116 DEBUG(DEBUG_INFO,("No ban recorded for this node. ctdb_unban_node() request ignored\n"));
120 talloc_free(rec->banned_nodes[pnn]);
121 rec->banned_nodes[pnn] = NULL;
126 called when a ban has timed out
/* timed-event callback: a ban period has expired — lift the ban again */
128 static void ctdb_ban_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
130 struct ban_state *state = talloc_get_type(p, struct ban_state);
131 struct ctdb_recoverd *rec = state->rec;
132 uint32_t pnn = state->banned_node;
134 DEBUG(DEBUG_NOTICE,("Ban timeout. Node %u is now unbanned\n", pnn));
/* ctdb_unban_node clears flags and frees the ban_state */
135 ctdb_unban_node(rec, pnn);
139 ban a node for a period of time
/* ban a node for ban_time seconds; forwards to the remote node when pnn
   is not ourselves, otherwise records local ban state and schedules the
   unban timeout */
141 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
143 struct ctdb_context *ctdb = rec->ctdb;
145 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
147 if (!ctdb_validate_pnn(ctdb, pnn)) {
148 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* bans can be disabled entirely via the EnableBans tunable */
152 if (0 == ctdb->tunable.enable_bans) {
153 DEBUG(DEBUG_INFO,("Bans are disabled - ignoring ban of node %u\n", pnn));
157 /* If we are banning a different node then just pass the ban info on */
158 if (pnn != ctdb->pnn) {
159 struct ctdb_ban_info b;
163 DEBUG(DEBUG_NOTICE,("Banning remote node %u for %u seconds. Passing the ban request on to the remote node.\n", pnn, ban_time))
;
166 b.ban_time = ban_time;
168 data.dptr = (uint8_t *)&b;
169 data.dsize = sizeof(b);
171 ret = ctdb_send_message(ctdb, pnn, CTDB_SRVID_BAN_NODE, data);
173 DEBUG(DEBUG_ERR,("Failed to ban node %u\n", pnn));
180 DEBUG(DEBUG_NOTICE,("self ban - lowering our election priority\n"));
181 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, NODE_FLAGS_BANNED, 0);
183 /* banning ourselves - lower our election priority */
184 rec->priority_time = timeval_current();
186 /* make sure we remember we are banned in case there is an
188 rec->node_flags |= NODE_FLAGS_BANNED;
/* replace any existing ban (and its timeout event) with the new one */
190 if (rec->banned_nodes[pnn] != NULL) {
191 DEBUG(DEBUG_NOTICE,("Re-banning an already banned node. Remove previous ban and set a new ban.\n"));
192 talloc_free(rec->banned_nodes[pnn]);
193 rec->banned_nodes[pnn] = NULL;
196 rec->banned_nodes[pnn] = talloc(rec->banned_nodes, struct ban_state);
197 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes[pnn]);
199 rec->banned_nodes[pnn]->rec = rec;
200 rec->banned_nodes[pnn]->banned_node = pnn;
/* schedule the automatic unban; event is a talloc child of the ban_state */
203 event_add_timed(ctdb->ev, rec->banned_nodes[pnn],
204 timeval_current_ofs(ban_time, 0),
205 ctdb_ban_timeout, rec->banned_nodes[pnn]);
/* outcome of one monitoring pass over the cluster */
209 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
213 run the "recovered" eventscript on all nodes
/* broadcast CTDB_CONTROL_END_RECOVERY to all active nodes so each runs
   its "recovered" eventscript; caller string is only used for logging */
215 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
220 tmp_ctx = talloc_new(ctdb);
221 CTDB_NO_MEMORY(ctdb, tmp_ctx);
223 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
224 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
226 CONTROL_TIMEOUT(), false, tdb_null,
229 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
231 talloc_free(tmp_ctx);
235 talloc_free(tmp_ctx);
240 remember the trouble maker
242 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
244 struct ctdb_context *ctdb = rec->ctdb;
246 if (rec->last_culprit != culprit ||
247 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
248 DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
249 /* either a new node is the culprit, or we've decided to forgive them */
250 rec->last_culprit = culprit;
251 rec->first_recover_time = timeval_current();
252 rec->culprit_counter = 0;
254 rec->culprit_counter++;
258 remember the trouble maker
/* remember the trouble maker, charging it with `count` incidents.
   The counter resets when the culprit changes or when the last incident
   is older than the RecoveryGracePeriod tunable. */
260 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
262 struct ctdb_context *ctdb = rec->ctdb;
264 if (rec->last_culprit != culprit ||
265 timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
266 DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
267 /* either a new node is the culprit, or we've decided to forgive them */
268 rec->last_culprit = culprit;
269 rec->first_recover_time = timeval_current();
270 rec->culprit_counter = 0;
272 rec->culprit_counter += count;
275 /* this callback is called for every node that failed to execute the
/* async-control fail callback: a node failed the "startrecovery" event,
   so charge it as the recovery culprit */
278 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
280 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
282 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
284 ctdb_set_culprit(rec, node_pnn);
288 run the "startrecovery" eventscript on all nodes
/* broadcast CTDB_CONTROL_START_RECOVERY to all active nodes; failing
   nodes are recorded as culprits via startrecovery_fail_callback */
290 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
294 struct ctdb_context *ctdb = rec->ctdb;
296 tmp_ctx = talloc_new(ctdb);
297 CTDB_NO_MEMORY(ctdb, tmp_ctx);
299 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
300 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
302 CONTROL_TIMEOUT(), false, tdb_null,
304 startrecovery_fail_callback,
306 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
307 talloc_free(tmp_ctx);
311 talloc_free(tmp_ctx);
315 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
317 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
318 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
321 if (node_pnn < ctdb->num_nodes) {
322 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
327 update the node capabilities for all connected nodes
/* query GET_CAPABILITIES from every active node; results are stored by
   async_getcap_callback into ctdb->nodes[].capabilities */
329 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
334 tmp_ctx = talloc_new(ctdb);
335 CTDB_NO_MEMORY(ctdb, tmp_ctx);
337 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
338 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
339 nodes, CONTROL_TIMEOUT(),
341 async_getcap_callback, NULL,
343 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
344 talloc_free(tmp_ctx);
348 talloc_free(tmp_ctx);
353 change recovery mode on all nodes
/* set the recovery mode on all active nodes; when entering ACTIVE
   recovery the nodes are frozen first so no database writes can race
   the recovery */
355 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t rec_mode)
361 tmp_ctx = talloc_new(ctdb);
362 CTDB_NO_MEMORY(ctdb, tmp_ctx);
364 /* freeze all nodes */
365 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
366 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
367 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
368 nodes, CONTROL_TIMEOUT(),
372 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
373 talloc_free(tmp_ctx);
379 data.dsize = sizeof(uint32_t);
380 data.dptr = (unsigned char *)&rec_mode;
382 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
383 nodes, CONTROL_TIMEOUT(),
387 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
388 talloc_free(tmp_ctx);
392 talloc_free(tmp_ctx);
397 change the recovery master on all nodes
/* tell every active node which pnn is the recovery master */
399 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
405 tmp_ctx = talloc_new(ctdb);
406 CTDB_NO_MEMORY(ctdb, tmp_ctx);
408 data.dsize = sizeof(uint32_t);
409 data.dptr = (unsigned char *)&pnn;
411 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
412 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
414 CONTROL_TIMEOUT(), false, data,
417 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
418 talloc_free(tmp_ctx);
422 talloc_free(tmp_ctx);
428 ensure all other nodes have attached to any databases that we have
/* make sure every other active node has attached to every database we
   have locally, creating missing ones remotely */
430 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
431 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
434 struct ctdb_dbid_map *remote_dbmap;
436 /* verify that all other nodes have all our databases */
437 for (j=0; j<nodemap->num; j++) {
438 /* we don't need to check ourselves */
439 if (nodemap->nodes[j].pnn == pnn) {
442 /* dont check nodes that are unavailable */
443 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
447 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
448 mem_ctx, &remote_dbmap);
450 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
454 /* step through all local databases */
455 for (db=0; db<dbmap->num;db++) {
459 for (i=0;i<remote_dbmap->num;i++) {
460 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
464 /* the remote node already have this database */
465 if (i!=remote_dbmap->num) {
468 /* ok so we need to create this database */
469 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
472 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
475 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
476 mem_ctx, name, dbmap->dbs[db].persistent);
478 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
489 ensure we are attached to any databases that anyone else is attached to
/* make sure we are locally attached to every database any other active
   node has; on success the local dbmap is re-read into *dbmap */
491 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
492 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
495 struct ctdb_dbid_map *remote_dbmap;
497 /* verify that we have all database any other node has */
498 for (j=0; j<nodemap->num; j++) {
499 /* we don't need to check ourselves */
500 if (nodemap->nodes[j].pnn == pnn) {
503 /* dont check nodes that are unavailable */
504 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
508 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
509 mem_ctx, &remote_dbmap);
511 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
515 /* step through all databases on the remote node */
516 for (db=0; db<remote_dbmap->num;db++) {
519 for (i=0;i<(*dbmap)->num;i++) {
520 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
524 /* we already have this db locally */
525 if (i!=(*dbmap)->num) {
528 /* ok so we need to create this database and
531 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
532 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
534 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
535 nodemap->nodes[j].pnn));
538 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
539 remote_dbmap->dbs[db].persistent);
541 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our local dbmap now that new databases may exist */
544 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
546 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
557 pull the remote database contents from one node into the recdb
/* pull one database from srcnode and merge its records into the local
   recovery tdb; a record wins over an existing one based on its rsn
   (and dmaster — see the header comparison below) */
559 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
560 struct tdb_wrap *recdb, uint32_t dbid)
564 struct ctdb_marshall_buffer *reply;
565 struct ctdb_rec_data *rec;
567 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
569 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
570 CONTROL_TIMEOUT(), &outdata);
572 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
573 talloc_free(tmp_ctx);
577 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the marshalled reply before walking it */
579 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
580 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
581 talloc_free(tmp_ctx);
585 rec = (struct ctdb_rec_data *)&reply->data[0];
/* walk the variable-length records: each rec is rec->length bytes */
589 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
591 struct ctdb_ltdb_header *hdr;
594 key.dptr = &rec->data[0];
595 key.dsize = rec->keylen;
596 data.dptr = &rec->data[key.dsize];
597 data.dsize = rec->datalen;
599 hdr = (struct ctdb_ltdb_header *)data.dptr;
601 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
602 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
603 talloc_free(tmp_ctx);
607 /* fetch the existing record, if any */
608 existing = tdb_fetch(recdb->tdb, key);
610 if (existing.dptr != NULL) {
611 struct ctdb_ltdb_header header;
612 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
613 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
614 (unsigned)existing.dsize, srcnode));
616 talloc_free(tmp_ctx);
619 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing record unless the incoming one has a higher rsn,
   or an equal rsn coming from somewhere other than the recovery master */
621 if (!(header.rsn < hdr->rsn ||
622 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
627 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
628 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
629 talloc_free(tmp_ctx);
634 talloc_free(tmp_ctx);
640 pull all the remote database contents into the recdb
/* merge the given database from every active node into the local
   recovery tdb; a failing source node is charged nodemap->num culprit
   counts so repeated failures get it banned quickly */
642 static int pull_remote_database(struct ctdb_context *ctdb,
643 struct ctdb_recoverd *rec,
644 struct ctdb_node_map *nodemap,
645 struct tdb_wrap *recdb, uint32_t dbid)
649 /* pull all records from all other nodes across onto this node
650 (this merges based on rsn)
652 for (j=0; j<nodemap->num; j++) {
653 /* dont merge from nodes that are unavailable */
654 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
657 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
658 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
659 nodemap->nodes[j].pnn));
660 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
670 update flags on all active nodes
/* push a node's flags to all nodes; modflags(pnn, flags, ~flags)
   sets exactly `flags` and clears everything else for that pnn */
672 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
676 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
678 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
686 ensure all nodes have the same vnnmap we do
/* push our vnnmap out to every active node, one node at a time */
688 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
689 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
693 /* push the new vnn map out to all the nodes */
694 for (j=0; j<nodemap->num; j++) {
695 /* dont push to nodes that are unavailable */
696 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
700 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
702 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
712 handler for when the admin bans a node
/* srvid message handler: the admin (or a remote recoverd) asked us to
   ban a node; validate the payload and apply the ban locally */
714 static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
715 TDB_DATA data, void *private_data)
717 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
718 struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
719 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
721 if (data.dsize != sizeof(*b)) {
722 DEBUG(DEBUG_ERR,("Bad data in ban_handler\n"));
723 talloc_free(mem_ctx);
/* only act on requests that are actually addressed to this node */
727 if (b->pnn != ctdb->pnn) {
728 DEBUG(DEBUG_ERR,("Got a ban request for pnn:%u but our pnn is %u. Ignoring ban request\n", b->pnn, ctdb->pnn));
732 DEBUG(DEBUG_NOTICE,("Node %u has been banned for %u seconds\n",
733 b->pnn, b->ban_time));
735 ctdb_ban_node(rec, b->pnn, b->ban_time);
736 talloc_free(mem_ctx);
740 handler for when the admin unbans a node
/* srvid message handler: unban request addressed to this node */
742 static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
743 TDB_DATA data, void *private_data)
745 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
746 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
749 if (data.dsize != sizeof(uint32_t)) {
750 DEBUG(DEBUG_ERR,("Bad data in unban_handler\n"));
751 talloc_free(mem_ctx);
754 pnn = *(uint32_t *)data.dptr;
/* only act on requests that are actually addressed to this node */
756 if (pnn != ctdb->pnn) {
757 DEBUG(DEBUG_ERR,("Got an unban request for pnn:%u but our pnn is %u. Ignoring unban request\n", pnn, ctdb->pnn));
761 DEBUG(DEBUG_NOTICE,("Node %u has been unbanned.\n", pnn));
762 ctdb_unban_node(rec, pnn);
763 talloc_free(mem_ctx);
/* NOTE(review): the "struct vacuum_info {" opener was lost by the
   extraction — these are the members of that struct (a doubly-linked
   list node tracking one in-progress vacuum-fetch batch). */
768 struct vacuum_info *next, *prev;
769 struct ctdb_recoverd *rec;
771 struct ctdb_db_context *ctdb_db;
/* marshalled records still to be fetched; r walks through them */
772 struct ctdb_marshall_buffer *recs;
773 struct ctdb_rec_data *r;
776 static void vacuum_fetch_next(struct vacuum_info *v);
779 called when a vacuum fetch has completed - just free it and do the next one
/* called when a vacuum fetch call completes — move on to the next record */
781 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
783 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
785 vacuum_fetch_next(v);
790 process the next element from the vacuum list
/* process the next record in the vacuum list: issue a CTDB_NULL_FUNC
   call with CTDB_IMMEDIATE_MIGRATION to migrate the record here; skips
   records that are locked, missing, malformed, or already local */
792 static void vacuum_fetch_next(struct vacuum_info *v)
794 struct ctdb_call call;
795 struct ctdb_rec_data *r;
797 while (v->recs->count) {
798 struct ctdb_client_call_state *state;
800 struct ctdb_ltdb_header *hdr;
803 call.call_id = CTDB_NULL_FUNC;
804 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance v->r to the next marshalled record before issuing the call */
807 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
810 call.key.dptr = &r->data[0];
811 call.key.dsize = r->keylen;
813 /* ensure we don't block this daemon - just skip a record if we can't get
815 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
819 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
820 if (data.dptr == NULL) {
821 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
825 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
827 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
831 hdr = (struct ctdb_ltdb_header *)data.dptr;
832 if (hdr->dmaster == v->rec->ctdb->pnn) {
833 /* its already local */
835 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* fire the async migration call; chainlock is released either way */
841 state = ctdb_call_send(v->ctdb_db, &call);
842 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
844 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
/* resume from vacuum_fetch_callback when the call completes */
848 state->async.fn = vacuum_fetch_callback;
849 state->async.private_data = v;
858 destroy a vacuum info structure
/* talloc destructor: unlink the vacuum_info from the recoverd list */
860 static int vacuum_info_destructor(struct vacuum_info *v)
862 DLIST_REMOVE(v->rec->vacuum_info, v);
868 handler for vacuum fetch
/* srvid message handler for vacuum-fetch requests: a remote node sends a
   batch of records it wants migrated to us; work out the database,
   attach to it, queue a vacuum_info and start fetching */
870 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
871 TDB_DATA data, void *private_data)
873 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
874 struct ctdb_marshall_buffer *recs;
876 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
878 struct ctdb_dbid_map *dbmap=NULL;
879 bool persistent = false;
880 struct ctdb_db_context *ctdb_db;
881 struct ctdb_rec_data *r;
883 struct vacuum_info *v;
885 recs = (struct ctdb_marshall_buffer *)data.dptr;
886 r = (struct ctdb_rec_data *)&recs->data[0];
888 if (recs->count == 0) {
889 talloc_free(tmp_ctx);
/* ignore the request if a batch from this node/db is already in flight */
895 for (v=rec->vacuum_info;v;v=v->next) {
896 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
897 /* we're already working on records from this node */
898 talloc_free(tmp_ctx);
903 /* work out if the database is persistent */
904 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
906 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
907 talloc_free(tmp_ctx);
911 for (i=0;i<dbmap->num;i++) {
912 if (dbmap->dbs[i].dbid == recs->db_id) {
913 persistent = dbmap->dbs[i].persistent;
917 if (i == dbmap->num) {
918 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
919 talloc_free(tmp_ctx);
923 /* find the name of this database */
924 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
925 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
926 talloc_free(tmp_ctx);
/* attach to it (no-op if already attached) */
931 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
932 if (ctdb_db == NULL) {
933 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
934 talloc_free(tmp_ctx);
938 v = talloc_zero(rec, struct vacuum_info);
940 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
941 talloc_free(tmp_ctx);
946 v->srcnode = srcnode;
947 v->ctdb_db = ctdb_db;
/* take our own copy of the record batch — data belongs to the caller */
948 v->recs = talloc_memdup(v, recs, data.dsize);
949 if (v->recs == NULL) {
950 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
952 talloc_free(tmp_ctx);
955 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
957 DLIST_ADD(rec->vacuum_info, v);
959 talloc_set_destructor(v, vacuum_info_destructor);
961 vacuum_fetch_next(v);
962 talloc_free(tmp_ctx);
967 called when ctdb_wait_timeout should finish
/* timed-event callback for ctdb_wait_timeout: flag that the wait is over */
969 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
970 struct timeval yt, void *p)
972 uint32_t *timed_out = (uint32_t *)p;
977 wait for a given number of seconds
/* block for `secs` seconds while still servicing the event loop */
979 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
981 uint32_t timed_out = 0;
982 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
/* loop until ctdb_wait_handler sets timed_out */
984 event_loop_once(ctdb->ev);
989 called when an election times out (ends)
/* timed-event callback: the election window closed — clearing
   election_timeout lets ctdb_wait_election() return */
991 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
992 struct timeval t, void *p)
994 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
995 rec->election_timeout = NULL;
1000 wait for an election to finish. It finished election_timeout seconds after
1001 the last election packet is received
/* spin the event loop until the election timeout fires
   (ctdb_election_timeout sets rec->election_timeout to NULL) */
1003 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1005 struct ctdb_context *ctdb = rec->ctdb;
1006 while (rec->election_timeout) {
1007 event_loop_once(ctdb->ev);
1012 Update our local flags from all remote connected nodes.
1013 This is only run when we are, or we believe we are, the recovery master
/* compare our nodemap flags against every connected remote node's view;
   propagate mismatched flags cluster-wide, and request a re-election if
   the BANNED flag differs.  Returns a monitor_result value. */
1015 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1018 struct ctdb_context *ctdb = rec->ctdb;
1019 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1021 /* get the nodemap for all active remote nodes and verify
1022 they are the same as for this node
1024 for (j=0; j<nodemap->num; j++) {
1025 struct ctdb_node_map *remote_nodemap=NULL;
1028 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1031 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1035 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1036 mem_ctx, &remote_nodemap);
1038 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1039 nodemap->nodes[j].pnn));
1040 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1041 talloc_free(mem_ctx);
1042 return MONITOR_FAILED;
1044 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
/* XOR isolates the bits that differ; we only care about BANNED */
1045 int ban_changed = (nodemap->nodes[j].flags ^ remote_nodemap->nodes[j].flags) & NODE_FLAGS_BANNED;
1048 DEBUG(DEBUG_NOTICE,("Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election\n",
1049 nodemap->nodes[j].pnn,
1050 remote_nodemap->nodes[j].flags,
1051 nodemap->nodes[j].flags));
1054 /* We should tell our daemon about this so it
1055 updates its flags or else we will log the same
1056 message again in the next iteration of recovery.
1057 Since we are the recovery master we can just as
1058 well update the flags on all nodes.
1060 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
1062 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1066 /* Update our local copy of the flags in the recovery
1069 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1070 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1071 nodemap->nodes[j].flags));
1072 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1074 /* If the BANNED flag has changed for the node
1075 this is a good reason to do a new election.
1078 talloc_free(mem_ctx);
1079 return MONITOR_ELECTION_NEEDED;
1083 talloc_free(remote_nodemap);
1085 talloc_free(mem_ctx);
1090 /* Create a new random generation ip.
1091 The generation id can not be the INVALID_GENERATION id
/* pick a new random generation id, retrying until it differs from
   INVALID_GENERATION */
1093 static uint32_t new_generation(void)
1095 uint32_t generation;
1098 generation = random();
1100 if (generation != INVALID_GENERATION) {
1110 create a temporary working database
/* create the temporary recovery tdb (recdb.tdb in the db directory)
   used to merge records from all nodes during recovery */
1112 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1115 struct tdb_wrap *recdb;
1118 /* open up the temporary recovery database */
1119 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
/* no locking needed — only the recovery daemon touches this tdb;
   avoid mmap when we are not allowed to change scheduling/settings */
1125 tdb_flags = TDB_NOLOCK;
1126 if (!ctdb->do_setsched) {
1127 tdb_flags |= TDB_NOMMAP;
1130 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1131 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1132 if (recdb == NULL) {
1133 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1143 a traverse function for pulling all relevent records from recdb
/* NOTE(review): the "struct recdb_data {" opener was lost by the
   extraction — these are the traversal-state members used by
   traverse_recdb below (plus len/failed fields not visible here). */
1146 struct ctdb_context *ctdb;
1147 struct ctdb_marshall_buffer *recdata;
/* tdb traverse callback: append each non-empty record to the marshall
   buffer being built in params->recdata, rewriting dmaster to ourselves */
1152 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1154 struct recdb_data *params = (struct recdb_data *)p;
1155 struct ctdb_rec_data *rec;
1156 struct ctdb_ltdb_header *hdr;
1158 /* skip empty records */
1159 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1163 /* update the dmaster field to point to us */
1164 hdr = (struct ctdb_ltdb_header *)data.dptr;
1165 hdr->dmaster = params->ctdb->pnn;
1167 /* add the record to the blob ready to send to the nodes */
1168 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1170 params->failed = true;
/* grow the blob to fit the new record */
1173 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1174 if (params->recdata == NULL) {
1175 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1176 rec->length + params->len, params->recdata->count));
1177 params->failed = true;
1180 params->recdata->count++;
/* append the marshalled record at the current end of the blob */
1181 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1182 params->len += rec->length;
1189 push the recdb database out to all nodes
/* marshal the whole recovery tdb into one blob and push it to all
   active nodes with CTDB_CONTROL_PUSH_DB */
1191 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1192 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1194 struct recdb_data params;
1195 struct ctdb_marshall_buffer *recdata;
1197 TALLOC_CTX *tmp_ctx;
1200 tmp_ctx = talloc_new(ctdb);
1201 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1203 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1204 CTDB_NO_MEMORY(ctdb, recdata);
1206 recdata->db_id = dbid;
1209 params.recdata = recdata;
1210 params.len = offsetof(struct ctdb_marshall_buffer, data);
1211 params.failed = false;
/* NOTE(review): "¶ms" below is an encoding artifact of "&params"
   ("&para;" HTML entity) introduced by the extraction — restore in source */
1213 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1214 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1215 talloc_free(params.recdata);
1216 talloc_free(tmp_ctx);
1220 if (params.failed) {
1221 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1222 talloc_free(params.recdata);
1223 talloc_free(tmp_ctx);
/* the traverse may have reallocated the buffer — pick up the new pointer */
1227 recdata = params.recdata;
1229 outdata.dptr = (void *)recdata;
1230 outdata.dsize = params.len;
1232 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1233 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1235 CONTROL_TIMEOUT(), false, outdata,
1238 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1239 talloc_free(recdata);
1240 talloc_free(tmp_ctx);
1244 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1245 dbid, recdata->count));
1247 talloc_free(recdata);
1248 talloc_free(tmp_ctx);
1255 go through a full recovery on one database
/* full recovery of one database: pull every node's copy into a temp
   recdb, wipe the database cluster-wide inside the transaction, then
   push the merged contents back out */
1257 static int recover_database(struct ctdb_recoverd *rec,
1258 TALLOC_CTX *mem_ctx,
1261 struct ctdb_node_map *nodemap,
1262 uint32_t transaction_id)
1264 struct tdb_wrap *recdb;
1266 struct ctdb_context *ctdb = rec->ctdb;
1268 struct ctdb_control_wipe_database w;
1271 recdb = create_recdb(ctdb, mem_ctx);
1272 if (recdb == NULL) {
1276 /* pull all remote databases onto the recdb */
1277 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
1279 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1283 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1285 /* wipe all the remote databases. This is safe as we are in a transaction */
1287 w.transaction_id = transaction_id;
1289 data.dptr = (void *)&w;
1290 data.dsize = sizeof(w);
1292 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1293 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1295 CONTROL_TIMEOUT(), false, data,
1298 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1303 /* push out the correct database. This sets the dmaster and skips
1304 the empty records */
1305 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1311 /* all done with this database */
1318 reload the nodes file
/* re-read the nodes file into the ctdb context */
1320 static void reload_nodes_file(struct ctdb_context *ctdb)
1323 ctdb_load_nodes_file(ctdb);
/*
 * Main recovery driver, run only on the recmaster. Sequence: ban a
 * repeat culprit, take the recovery lock, sync the database list across
 * nodes, freeze the cluster (recovery mode ACTIVE), run "startrecovery",
 * bump the generation, open a cluster-wide transaction, recover each
 * database, commit, rebuild and distribute the vnnmap, re-assert the
 * recmaster, sync flags, thaw, redistribute public IPs, run "recovered",
 * and finally broadcast a reconfigure message to clients.
 * NOTE(review): line-sampled chunk — error-path returns/braces between
 * visible lines are not shown.
 */
1328 we are the recmaster, and recovery is needed - start a recovery run
1330 static int do_recovery(struct ctdb_recoverd *rec,
1331 TALLOC_CTX *mem_ctx, uint32_t pnn,
1332 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap,
1335 struct ctdb_context *ctdb = rec->ctdb;
1337 uint32_t generation;
1338 struct ctdb_dbid_map *dbmap;
1341 struct timeval start_time;
1343 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1345 /* if recovery fails, force it again */
1346 rec->need_recovery = true;
1348 if (culprit != -1) {
1349 ctdb_set_culprit(rec, culprit);
/* ban a node that keeps triggering recoveries (more than twice per node) */
1352 if (rec->culprit_counter > 2*nodemap->num) {
1353 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n",
1354 rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
1355 ctdb->tunable.recovery_ban_period));
1356 ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
/* grab the cluster-wide recovery lock file before touching any database */
1360 if (ctdb->tunable.verify_recovery_lock != 0) {
1361 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1362 start_time = timeval_current();
1363 if (!ctdb_recovery_lock(ctdb, true)) {
1364 ctdb_set_culprit(rec, pnn);
1365 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1368 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1369 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1372 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", culprit));
1374 /* get a list of all databases */
1375 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1377 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1381 /* we do the db creation before we set the recovery mode, so the freeze happens
1382 on all databases we will be dealing with. */
1384 /* verify that we have all the databases any other node has */
1385 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1387 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1391 /* verify that all other nodes have all our databases */
1392 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1394 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1398 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1401 /* set recovery mode to active on all nodes */
1402 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1404 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1408 /* execute the "startrecovery" event script on all nodes */
1409 ret = run_startrecovery_eventscript(rec, nodemap);
1411 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1415 /* pick a new generation number */
1416 generation = new_generation();
1418 /* change the vnnmap on this node to use the new generation
1419 number but not on any other nodes.
1420 this guarantees that if we abort the recovery prematurely
1421 for some reason (a node stops responding?)
1422 that we can just return immediately and we will reenter
1423 recovery shortly again.
1424 I.e. we deliberately leave the cluster with an inconsistent
1425 generation id to allow us to abort recovery at any stage and
1426 just restart it from scratch.
1428 vnnmap->generation = generation;
1429 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1431 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* start a transaction on every active node, tagged with the generation */
1435 data.dptr = (void *)&generation;
1436 data.dsize = sizeof(uint32_t);
1438 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1439 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1441 CONTROL_TIMEOUT(), false, data,
1444 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1448 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* recover every database inside the open transaction */
1450 for (i=0;i<dbmap->num;i++) {
1451 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1452 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1457 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1459 /* commit all the changes */
1460 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1462 CONTROL_TIMEOUT(), false, data,
1465 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1469 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1472 /* update the capabilities for all nodes */
1473 ret = update_capabilities(ctdb, nodemap);
1475 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1479 /* build a new vnn map with all the currently active and
1481 generation = new_generation();
1482 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1483 CTDB_NO_MEMORY(ctdb, vnnmap);
1484 vnnmap->generation = generation;
1486 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1487 CTDB_NO_MEMORY(ctdb, vnnmap->map);
/* only active nodes with the LMASTER capability join the vnnmap */
1488 for (i=j=0;i<nodemap->num;i++) {
1489 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1492 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1493 /* this node can not be an lmaster */
1494 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1499 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1500 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1501 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* degenerate case: no lmaster-capable node; fall back to ourselves */
1504 if (vnnmap->size == 0) {
1505 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1507 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1508 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1509 vnnmap->map[0] = pnn;
1512 /* update to the new vnnmap on all nodes */
1513 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1515 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1519 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1521 /* update recmaster to point to us for all nodes */
1522 ret = set_recovery_master(ctdb, nodemap, pnn);
1524 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1528 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1531 update all nodes to have the same flags that we have
1533 for (i=0;i<nodemap->num;i++) {
1534 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1538 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1540 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1545 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1547 /* disable recovery mode */
1548 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_NORMAL);
1550 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1554 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1557 tell nodes to takeover their public IPs
1559 rec->need_takeover_run = false;
1560 ret = ctdb_takeover_run(ctdb, nodemap);
1562 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1565 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1567 /* execute the "recovered" event script on all nodes */
1568 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1570 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1574 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1576 /* send a message to all clients telling them that the cluster
1577 has been reconfigured */
1578 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1580 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1582 rec->need_recovery = false;
1584 /* We just finished a recovery successfully.
1585 We now wait for rerecovery_timeout before we allow
1586 another recovery to take place.
1588 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1589 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1590 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/*
 * On-the-wire payload for recmaster elections. Comparison order (see
 * ctdb_election_win): banned status, connected-node count, priority
 * (start) time, then pnn as the final tie-breaker.
 */
1597 elections are won by first checking the number of connected nodes, then
1598 the priority time, then the pnn
1600 struct election_message {
1601 uint32_t num_connected;
1602 struct timeval priority_time;
1604 uint32_t node_flags;
/*
 * Fill in *em with this node's election credentials: pnn, daemon start
 * time, flags and the number of currently connected nodes. A node
 * without the RECMASTER capability deliberately cripples its own data
 * (zero connections, "newest" priority time) so it cannot win.
 */
1608 form this nodes election data
1610 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1613 struct ctdb_node_map *nodemap;
1614 struct ctdb_context *ctdb = rec->ctdb;
1618 em->pnn = rec->ctdb->pnn;
1619 em->priority_time = rec->priority_time;
1620 em->node_flags = rec->node_flags;
1622 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1624 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
/* count every node we can still talk to */
1628 for (i=0;i<nodemap->num;i++) {
1629 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1630 em->num_connected++;
1634 /* we shouldnt try to win this election if we cant be a recmaster */
1635 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1636 em->num_connected = 0;
1637 em->priority_time = timeval_current();
1640 talloc_free(nodemap);
/*
 * Decide whether WE beat the election data *em received from another
 * node. Order of precedence: capability, ban status, connection count,
 * longest-running daemon (earlier priority_time wins), lowest pnn.
 * Returns true if we should keep contesting the election.
 */
1644 see if the given election data wins
1646 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1648 struct election_message myem;
1651 ctdb_election_data(rec, &myem);
1653 /* we cant win if we dont have the recmaster capability */
1654 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1658 /* we cant win if we are banned */
1659 if (rec->node_flags & NODE_FLAGS_BANNED) {
1663 /* we will automatically win if the other node is banned */
1664 if (em->node_flags & NODE_FLAGS_BANNED) {
1668 /* try to use the most connected node */
1670 cmp = (int)myem.num_connected - (int)em->num_connected;
1673 /* then the longest running node */
1675 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-breaker: lower pnn — NOTE(review) sign convention not
   visible here; confirm against the surrounding return statements */
1679 cmp = (int)myem.pnn - (int)em->pnn;
/*
 * Broadcast our election credentials to all nodes on the RECOVERY
 * srvid. When update_recmaster is true we optimistically record
 * ourselves as recmaster on the local node, anticipating a win.
 * Returns 0 on success; non-zero when setting the recmaster fails.
 */
1686 send out an election request
1688 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1691 TDB_DATA election_data;
1692 struct election_message emsg;
1694 struct ctdb_context *ctdb = rec->ctdb;
1696 srvid = CTDB_SRVID_RECOVERY;
1698 ctdb_election_data(rec, &emsg);
1700 election_data.dsize = sizeof(struct election_message);
1701 election_data.dptr = (unsigned char *)&emsg;
1704 /* send an election message to all active nodes */
1705 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1708 /* A new node that is already frozen has entered the cluster.
1709 The existing nodes are not frozen and dont need to be frozen
1710 until the election has ended and we start the actual recovery
1712 if (update_recmaster == true) {
1713 /* first we assume we will win the election and set
1714 recoverymaster to be ourself on the current node
1716 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1718 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
/*
 * Clear the BANNED flag on every connected node in the cluster.
 * Best-effort: failures to fetch the nodemap are logged and skipped.
 */
1728 this function will unban all nodes in the cluster
1730 static void unban_all_nodes(struct ctdb_context *ctdb)
1733 struct ctdb_node_map *nodemap;
1734 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1736 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1738 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
/* only nodes we can reach AND that are banned need the flag cleared */
1742 for (i=0;i<nodemap->num;i++) {
1743 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1744 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1745 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1749 talloc_free(tmp_ctx);
/*
 * Timed-event callback fired while we believe we are winning an
 * election: rebroadcast our election request (without touching the
 * local recmaster setting) and disarm the one-shot timer.
 */
1754 we think we are winning the election - send a broadcast election request
1756 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1758 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1761 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1763 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* one-shot: free the timer so election_handler can re-arm it later */
1766 talloc_free(rec->send_election_te);
1767 rec->send_election_te = NULL;
/*
 * Message handler for memory-dump requests: validate the reply address
 * in the payload, collect this daemon's talloc memory report, and send
 * it back to the requester's pnn/srvid. All temporaries hang off
 * tmp_ctx so every exit path frees them in one call.
 */
1771 handler for memory dumps
1773 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1774 TDB_DATA data, void *private_data)
1776 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1779 struct rd_memdump_reply *rd;
1781 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1782 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1783 talloc_free(tmp_ctx);
1786 rd = (struct rd_memdump_reply *)data.dptr;
1788 dump = talloc_zero(tmp_ctx, TDB_DATA);
1790 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1791 talloc_free(tmp_ctx);
1794 ret = ctdb_dump_memory(ctdb, dump);
1796 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1797 talloc_free(tmp_ctx);
1801 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1803 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1805 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1806 talloc_free(tmp_ctx);
1810 talloc_free(tmp_ctx);
/*
 * Message handler: another component asked the recovery daemon to
 * re-read the nodes file; delegate to reload_nodes_file().
 */
1814 handler for reload_nodes
1816 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1817 TDB_DATA data, void *private_data)
1819 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1821 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1823 reload_nodes_file(rec->ctdb);
1827 handler for ip reallocate, just add it to the list of callers and
1828 handle this later in the monitor_cluster loop so we do not recurse
1829 with other callers to takeover_run()
1831 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1832 TDB_DATA data, void *private_data)
1834 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1835 struct ip_reallocate_list *caller;
1837 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1838 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1842 if (rec->ip_reallocate_ctx == NULL) {
1843 rec->ip_reallocate_ctx = talloc_new(rec);
1844 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1847 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1848 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1850 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1851 caller->next = rec->reallocate_callers;
1852 rec->reallocate_callers = caller;
/*
 * Service all queued "ctdb ipreallocate" callers: run a single IP
 * takeover pass, then send the int32 result code back to every caller
 * recorded by ip_reallocate_handler(), and reset the queue.
 */
1857 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1859 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1862 struct ip_reallocate_list *callers;
1864 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1865 ret = ctdb_takeover_run(ctdb, rec->nodemap);
/* every caller gets the same takeover result code */
1866 result.dsize = sizeof(int32_t);
1867 result.dptr = (uint8_t *)&ret;
1869 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1870 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1871 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1873 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1877 talloc_free(tmp_ctx);
/* freeing the context releases every queued caller at once */
1878 talloc_free(rec->ip_reallocate_ctx);
1879 rec->ip_reallocate_ctx = NULL;
1880 rec->reallocate_callers = NULL;
/*
 * Message handler for incoming election packets. Refresh the election
 * timeout, compare the sender's credentials against ours: if we win,
 * schedule a (delayed, one-shot) rebroadcast of our own request;
 * otherwise concede — release the recovery lock if we hold it, record
 * the sender as recmaster, and clear all local ban state.
 * NOTE(review): line-sampled chunk — some returns/braces not shown.
 */
1886 handler for recovery master elections
1888 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1889 TDB_DATA data, void *private_data)
1891 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1893 struct election_message *em = (struct election_message *)data.dptr;
1894 TALLOC_CTX *mem_ctx;
1896 /* we got an election packet - update the timeout for the election */
1897 talloc_free(rec->election_timeout);
1898 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1899 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1900 ctdb_election_timeout, rec);
1902 mem_ctx = talloc_new(ctdb);
1904 /* someone called an election. check their election data
1905 and if we disagree and we would rather be the elected node,
1906 send a new election message to all other nodes
1908 if (ctdb_election_win(rec, em)) {
1909 if (!rec->send_election_te) {
/* delay the counter-broadcast by 0.5s to let the election settle */
1910 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1911 timeval_current_ofs(0, 500000),
1912 election_send_request, rec);
1914 talloc_free(mem_ctx);
1915 /*unban_all_nodes(ctdb);*/
/* we lost: cancel any pending rebroadcast of our own candidacy */
1920 talloc_free(rec->send_election_te);
1921 rec->send_election_te = NULL;
1923 if (ctdb->tunable.verify_recovery_lock != 0) {
1924 /* release the recmaster lock */
1925 if (em->pnn != ctdb->pnn &&
1926 ctdb->recovery_lock_fd != -1) {
1927 close(ctdb->recovery_lock_fd);
1928 ctdb->recovery_lock_fd = -1;
1929 unban_all_nodes(ctdb);
1933 /* ok, let that guy become recmaster then */
1934 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1936 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1937 talloc_free(mem_ctx);
1941 /* release any bans */
1942 rec->last_culprit = (uint32_t)-1;
1943 talloc_free(rec->banned_nodes);
1944 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
1945 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
1947 talloc_free(mem_ctx);
/*
 * Kick off a recmaster election: freeze the cluster (recovery mode
 * ACTIVE stops internode traffic), arm the election timeout, broadcast
 * our candidacy (optimistically recording ourselves as recmaster
 * locally), then block until the election window closes.
 */
1953 force the start of the election process
1955 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1956 struct ctdb_node_map *nodemap)
1959 struct ctdb_context *ctdb = rec->ctdb;
1961 /* set all nodes to recovery mode to stop all internode traffic */
1962 ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE);
1964 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1968 talloc_free(rec->election_timeout);
1969 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1970 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1971 ctdb_election_timeout, rec);
1973 ret = send_election_request(rec, pnn, true);
1975 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1979 /* wait for a few seconds to collect all responses */
1980 ctdb_wait_election(rec);
/*
 * Message handler for node flag changes. Look the node up in a fresh
 * nodemap, record its new flags, and — when we are the recmaster and
 * the cluster is in normal mode — schedule a takeover run if the
 * DISABLED flag changed (disconnect/ban cases are handled elsewhere
 * by full recovery).
 * NOTE(review): line-sampled chunk — some returns/braces not shown.
 */
1986 handler for when a node changes its flags
1988 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1989 TDB_DATA data, void *private_data)
1992 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1993 struct ctdb_node_map *nodemap=NULL;
1994 TALLOC_CTX *tmp_ctx;
1995 uint32_t changed_flags;
1997 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1999 if (data.dsize != sizeof(*c)) {
2000 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2004 tmp_ctx = talloc_new(ctdb);
2005 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2007 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2009 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2010 talloc_free(tmp_ctx);
/* find the nodemap slot for the node whose flags changed */
2015 for (i=0;i<nodemap->num;i++) {
2016 if (nodemap->nodes[i].pnn == c->pnn) break;
2019 if (i == nodemap->num) {
2020 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2021 talloc_free(tmp_ctx);
2025 changed_flags = c->old_flags ^ c->new_flags;
2027 if (nodemap->nodes[i].flags != c->new_flags) {
2028 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2031 nodemap->nodes[i].flags = c->new_flags;
2033 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2034 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2037 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2038 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2042 ctdb->recovery_master == ctdb->pnn &&
2043 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2044 /* Only do the takeover run if the perm disabled or unhealthy
2045 flags changed since these will cause an ip failover but not
2047 If the node became disconnected or banned this will also
2048 lead to an ip address failover but that is handled
2051 if (changed_flags & NODE_FLAGS_DISABLED) {
2052 rec->need_takeover_run = true;
2056 talloc_free(tmp_ctx);
/*
 * Message handler: propagate one node's flag change to all nodes by
 * setting exactly new_flags (clearing everything else via ~new_flags).
 */
2060 handler for when we need to push out flag changes ot all other nodes
2062 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2063 TDB_DATA data, void *private_data)
2066 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2068 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
2070 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Shared state for the async recmode poll: aggregated result status
   (plus a pending-reply counter on the elided line). */
2075 struct verify_recmode_normal_data {
2077 enum monitor_result status;
/*
 * Async completion callback for one node's getrecmode reply. Records
 * MONITOR_FAILED if the control itself failed, or
 * MONITOR_RECOVERY_NEEDED if the node reports it is in recovery mode.
 */
2080 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2082 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2085 /* one more node has responded with recmode data*/
2088 /* if we failed to get the recmode, then return an error and let
2089 the main loop try again.
2091 if (state->state != CTDB_CONTROL_DONE) {
2092 if (rmdata->status == MONITOR_OK) {
2093 rmdata->status = MONITOR_FAILED;
2098 /* if we got a response, then the recmode will be stored in the
2101 if (state->status != CTDB_RECOVERY_NORMAL) {
2102 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2103 rmdata->status = MONITOR_RECOVERY_NEEDED;
/*
 * Fan out an async getrecmode to every active node and pump the event
 * loop until all replies arrive, aggregating into a monitor_result.
 * Returns MONITOR_OK only if every node is in NORMAL recovery mode.
 */
2110 /* verify that all nodes are in normal recovery mode */
2111 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2113 struct verify_recmode_normal_data *rmdata;
2114 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2115 struct ctdb_client_control_state *state;
2116 enum monitor_result status;
2119 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2120 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2122 rmdata->status = MONITOR_OK;
2124 /* loop over all active nodes and send an async getrecmode call to
2126 for (j=0; j<nodemap->num; j++) {
2127 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2130 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2132 nodemap->nodes[j].pnn);
2133 if (state == NULL) {
2134 /* we failed to send the control, treat this as
2135 an error and try again next iteration
2137 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2138 talloc_free(mem_ctx);
2139 return MONITOR_FAILED;
2142 /* set up the callback functions */
2143 state->async.fn = verify_recmode_normal_callback;
2144 state->async.private_data = rmdata;
2146 /* one more control to wait for to complete */
2151 /* now wait for up to the maximum number of seconds allowed
2152 or until all nodes we expect a response from has replied
2154 while (rmdata->count > 0) {
2155 event_loop_once(ctdb->ev);
/* copy out before freeing rmdata's parent context */
2158 status = rmdata->status;
2159 talloc_free(mem_ctx);
/* Shared state for the async recmaster poll: the recovery daemon,
   aggregated status (plus pending-count/expected-pnn on elided lines). */
2164 struct verify_recmaster_data {
2165 struct ctdb_recoverd *rec;
2168 enum monitor_result status;
/*
 * Async completion callback for one node's getrecmaster reply. Flags
 * MONITOR_FAILED on control failure; if the node names a different
 * recmaster than us it is marked as culprit and a new election is
 * requested via MONITOR_ELECTION_NEEDED.
 */
2171 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2173 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2176 /* one more node has responded with recmaster data*/
2179 /* if we failed to get the recmaster, then return an error and let
2180 the main loop try again.
2182 if (state->state != CTDB_CONTROL_DONE) {
2183 if (rmdata->status == MONITOR_OK) {
2184 rmdata->status = MONITOR_FAILED;
2189 /* if we got a response, then the recmaster will be stored in the
2192 if (state->status != rmdata->pnn) {
2193 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2194 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2195 rmdata->status = MONITOR_ELECTION_NEEDED;
/*
 * Fan out an async getrecmaster to every active node and pump the
 * event loop until all replies arrive. Returns MONITOR_OK only when
 * every node agrees that pnn (us) is the recmaster; mirrors
 * verify_recmode() structurally.
 */
2202 /* verify that all nodes agree that we are the recmaster */
2203 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2205 struct ctdb_context *ctdb = rec->ctdb;
2206 struct verify_recmaster_data *rmdata;
2207 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2208 struct ctdb_client_control_state *state;
2209 enum monitor_result status;
2212 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2213 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2217 rmdata->status = MONITOR_OK;
2219 /* loop over all active nodes and send an async getrecmaster call to
2221 for (j=0; j<nodemap->num; j++) {
2222 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2225 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2227 nodemap->nodes[j].pnn);
2228 if (state == NULL) {
2229 /* we failed to send the control, treat this as
2230 an error and try again next iteration
2232 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2233 talloc_free(mem_ctx);
2234 return MONITOR_FAILED;
2237 /* set up the callback functions */
2238 state->async.fn = verify_recmaster_callback;
2239 state->async.private_data = rmdata;
2241 /* one more control to wait for to complete */
2246 /* now wait for up to the maximum number of seconds allowed
2247 or until all nodes we expect a response from has replied
2249 while (rmdata->count > 0) {
2250 event_loop_once(ctdb->ev);
/* copy out before freeing rmdata's parent context */
2253 status = rmdata->status;
2254 talloc_free(mem_ctx);
/*
 * Sanity-check this node's public IP assignments against what it
 * actually serves. Uptime is sampled before and after reading the IP
 * list; if a recovery started/finished in between (or is in progress)
 * the check is skipped to avoid acting on a stale snapshot. On a
 * mismatch in either direction, the node freezes itself and sets
 * recovery mode ACTIVE so the recmaster performs a full recovery.
 * NOTE(review): line-sampled chunk — some returns/braces not shown.
 */
2259 /* called to check that the allocation of public ip addresses is ok.
2261 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2263 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2264 struct ctdb_all_public_ips *ips = NULL;
2265 struct ctdb_uptime *uptime1 = NULL;
2266 struct ctdb_uptime *uptime2 = NULL;
2269 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2270 CTDB_CURRENT_NODE, &uptime1);
2272 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2273 talloc_free(mem_ctx);
2277 /* read the ip allocation from the local node */
2278 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2280 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2281 talloc_free(mem_ctx);
2285 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2286 CTDB_CURRENT_NODE, &uptime2);
2288 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2289 talloc_free(mem_ctx);
2293 /* skip the check if the startrecovery time has changed */
2294 if (timeval_compare(&uptime1->last_recovery_started,
2295 &uptime2->last_recovery_started) != 0) {
2296 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2297 talloc_free(mem_ctx);
2301 /* skip the check if the endrecovery time has changed */
2302 if (timeval_compare(&uptime1->last_recovery_finished,
2303 &uptime2->last_recovery_finished) != 0) {
2304 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2305 talloc_free(mem_ctx);
2309 /* skip the check if we have started but not finished recovery */
2310 if (timeval_compare(&uptime1->last_recovery_finished,
2311 &uptime1->last_recovery_started) != 1) {
2312 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
2313 talloc_free(mem_ctx);
2318 /* verify that we have the ip addresses we should have
2319 and we dont have ones we shouldnt have.
2320 if we find an inconsistency we set recmode to
2321 active on the local node and wait for the recmaster
2322 to do a full blown recovery
2324 for (j=0; j<ips->num; j++) {
2325 if (ips->ips[j].pnn == pnn) {
/* assigned to us but not actually held on an interface */
2326 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2327 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2328 ctdb_addr_to_str(&ips->ips[j].addr)));
2329 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2331 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2333 talloc_free(mem_ctx);
2336 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2338 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2340 talloc_free(mem_ctx);
/* held on an interface but assigned to some other node */
2345 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2346 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2347 ctdb_addr_to_str(&ips->ips[j].addr)));
2349 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2351 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2353 talloc_free(mem_ctx);
2356 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2358 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2360 talloc_free(mem_ctx);
2367 talloc_free(mem_ctx);
/*
 * Async callback: store one remote node's nodemap reply into the
 * caller-provided array, indexed by pnn, stealing ownership of the
 * reply buffer onto the array. Out-of-range pnns are rejected.
 */
2372 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2374 struct ctdb_node_map **remote_nodemaps = callback_data;
2376 if (node_pnn >= ctdb->num_nodes) {
2377 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2381 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Collect the nodemap from every active node in parallel via the async
 * control mechanism; results land in remote_nodemaps[pnn] through
 * async_getnodemap_callback(). Non-zero return means at least one node
 * failed to reply.
 */
2385 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2386 struct ctdb_node_map *nodemap,
2387 struct ctdb_node_map **remote_nodemaps)
2391 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2392 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2394 CONTROL_TIMEOUT(), false, tdb_null,
2395 async_getnodemap_callback,
2397 remote_nodemaps) != 0) {
2398 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Child-process reclock probe: status codes and the state tracking the
   forked checker (pipe fds, timeout timer, fd event — child pid on an
   elided line). */
2406 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
2407 struct ctdb_check_reclock_state {
2408 struct ctdb_context *ctdb;
2409 struct timeval start_time;
2412 struct timed_event *te;
2413 struct fd_event *fde;
2414 enum reclock_child_status status;
/*
 * talloc destructor for the reclock-check state: report the measured
 * lock latency, close both pipe ends if still open, and SIGKILL the
 * child so it never outlives the check.
 */
2417 /* when we free the reclock state we must kill any child process.
2419 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2421 struct ctdb_context *ctdb = state->ctdb;
2423 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2425 if (state->fd[0] != -1) {
2426 close(state->fd[0]);
2429 if (state->fd[1] != -1) {
2430 close(state->fd[1]);
2433 kill(state->child, SIGKILL);
/*
 * Timer callback: the reclock child did not answer in time, which
 * indicates blocked I/O on the cluster filesystem; mark the check
 * as timed out so check_recovery_lock() can react.
 */
2438 called if our check_reclock child times out. this would happen if
2439 i/o to the reclock file blocks.
2441 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2442 struct timeval t, void *private_data)
2444 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2445 struct ctdb_check_reclock_state);
2447 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2448 state->status = RECLOCK_TIMEOUT;
/*
 * fd event: the reclock child wrote its single status byte back over
 * the pipe. Cancel the timeout timer and translate the byte into
 * RECLOCK_OK / RECLOCK_FAILED on the shared state.
 */
2451 /* this is called when the child process has completed checking the reclock
2452 file and has written data back to us through the pipe.
2454 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2455 uint16_t flags, void *private_data)
2457 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2458 struct ctdb_check_reclock_state);
2462 /* we got a response from our child process so we can abort the
2465 talloc_free(state->te);
2468 ret = read(state->fd[0], &c, 1);
2469 if (ret != 1 || c != RECLOCK_OK) {
2470 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2471 state->status = RECLOCK_FAILED;
2476 state->status = RECLOCK_OK;
/*
 * Verify we still hold the recovery lock by forking a child that does
 * a pread() on the lock fd (which blocks if the cluster filesystem has
 * revoked our lock) and reports one status byte over a pipe. The
 * parent pumps the event loop until the child answers or a 15s timer
 * fires; on RECLOCK_FAILED the lock fd is closed so it is re-taken on
 * the next recovery.
 * NOTE(review): line-sampled chunk — some returns/braces not shown.
 */
2480 static int check_recovery_lock(struct ctdb_context *ctdb)
2483 struct ctdb_check_reclock_state *state;
2484 pid_t parent = getpid();
2486 if (ctdb->recovery_lock_fd == -1) {
2487 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2491 state = talloc(ctdb, struct ctdb_check_reclock_state);
2492 CTDB_NO_MEMORY(ctdb, state);
2495 state->start_time = timeval_current();
2496 state->status = RECLOCK_CHECKING;
2500 ret = pipe(state->fd);
2503 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2507 state->child = fork();
2508 if (state->child == (pid_t)-1) {
2509 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2510 close(state->fd[0]);
2512 close(state->fd[1]);
/* child: probe the lock fd, write one status byte, then linger until
   the parent dies (child holds only the write end of the pipe) */
2518 if (state->child == 0) {
2519 char cc = RECLOCK_OK;
2520 close(state->fd[0]);
2523 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2524 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2525 cc = RECLOCK_FAILED;
/* NOTE(review): write() return values are ignored here — best-effort */
2528 write(state->fd[1], &cc, 1);
2529 /* make sure we die when our parent dies */
2530 while (kill(parent, 0) == 0 || errno != ESRCH) {
2532 write(state->fd[1], &cc, 1);
/* parent continues: close the write end we don't use */
2536 close(state->fd[1]);
2539 talloc_set_destructor(state, check_reclock_destructor);
2541 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2542 ctdb_check_reclock_timeout, state);
2543 if (state->te == NULL) {
2544 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2549 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2550 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2551 reclock_child_handler,
2554 if (state->fde == NULL) {
2555 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* block here until the child answers or the 15s timer fires */
2560 while (state->status == RECLOCK_CHECKING) {
2561 event_loop_once(ctdb->ev);
2564 if (state->status == RECLOCK_FAILED) {
2565 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2566 close(ctdb->recovery_lock_fd);
2567 ctdb->recovery_lock_fd = -1;
/* Re-read the reclock file path from the main daemon and react to
 * changes: if the reclock was disabled, drop our copy and the open fd
 * and turn off verify_recovery_lock; if the path changed (or we had
 * none), adopt the new path and close any fd for the old file. */
2576 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2578 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2579 const char *reclockfile;
2581 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2582 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2583 talloc_free(tmp_ctx);
/* daemon reports no reclock file: disable verification locally */
2587 if (reclockfile == NULL) {
2588 if (ctdb->recovery_lock_file != NULL) {
2589 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2590 talloc_free(ctdb->recovery_lock_file);
2591 ctdb->recovery_lock_file = NULL;
2592 if (ctdb->recovery_lock_fd != -1) {
2593 close(ctdb->recovery_lock_fd);
2594 ctdb->recovery_lock_fd = -1;
2597 ctdb->tunable.verify_recovery_lock = 0;
2598 talloc_free(tmp_ctx);
/* we had no reclock path yet: adopt the one the daemon gave us */
2602 if (ctdb->recovery_lock_file == NULL) {
/* NOTE(review): talloc_strdup() result is not checked for NULL */
2603 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2604 if (ctdb->recovery_lock_fd != -1) {
2605 close(ctdb->recovery_lock_fd);
2606 ctdb->recovery_lock_fd = -1;
2608 talloc_free(tmp_ctx);
/* unchanged path: nothing to do */
2613 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2614 talloc_free(tmp_ctx);
/* the path changed: switch to the new file and drop the old fd */
2618 talloc_free(ctdb->recovery_lock_file);
2619 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2620 ctdb->tunable.verify_recovery_lock = 0;
2621 if (ctdb->recovery_lock_fd != -1) {
2622 close(ctdb->recovery_lock_fd);
2623 ctdb->recovery_lock_fd = -1;
2626 talloc_free(tmp_ctx);
2631 the main monitoring loop
/* The recovery daemon's main loop. Sets up the recoverd state and all
 * SRVID message handlers once, then loops forever: ping the daemon,
 * refresh tunables/reclock/pnn/vnnmap/nodemap/recmaster, and - when we
 * are the recovery master - verify cluster-wide consistency of flags,
 * nodemaps and vnnmaps, triggering elections or do_recovery() as
 * needed. Never returns under normal operation. */
2633 static void monitor_cluster(struct ctdb_context *ctdb)
2636 TALLOC_CTX *mem_ctx=NULL;
2637 struct ctdb_node_map *nodemap=NULL;
2638 struct ctdb_node_map *recmaster_nodemap=NULL;
2639 struct ctdb_node_map **remote_nodemaps=NULL;
2640 struct ctdb_vnn_map *vnnmap=NULL;
2641 struct ctdb_vnn_map *remote_vnnmap=NULL;
2642 int32_t debug_level;
2644 struct ctdb_recoverd *rec;
2646 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2648 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2649 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2652 rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes);
2653 CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes);
2655 rec->priority_time = timeval_current();
2657 /* register a message port for sending memory dumps */
2658 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2660 /* register a message port for recovery elections */
2661 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2663 /* when nodes are disabled/enabled */
2664 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2666 /* when we are asked to push out a flag change */
2667 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2669 /* when nodes are banned */
2670 ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec);
2672 /* and one for when nodes are unbanned */
2673 ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);
2675 /* register a message port for vacuum fetch */
2676 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2678 /* register a message port for reloadnodes */
2679 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2681 /* register a message port for performing a takeover run */
2682 ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
/* top of the monitoring loop: recycle the per-iteration talloc
 * context so each pass starts with a fresh temporary context */
2686 talloc_free(mem_ctx);
2689 mem_ctx = talloc_new(ctdb);
2691 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2695 /* we only check for recovery once every second */
2696 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2698 /* verify that the main daemon is still running */
2699 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2700 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2704 /* ping the local daemon to tell it we are alive */
2705 ctdb_ctrl_recd_ping(ctdb);
2707 if (rec->election_timeout) {
2708 /* an election is in progress */
2712 /* read the debug level from the parent and update locally */
2713 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2715 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2718 LogLevel = debug_level;
2721 /* We must check if we need to ban a node here but we want to do this
2722 as early as possible so we don't wait until we have pulled the node
2723 map from the local node. that is why we have the hardcoded value 20
2725 if (rec->culprit_counter > 20) {
2726 DEBUG(DEBUG_NOTICE,("Node %u has caused %u failures in %.0f seconds - banning it for %u seconds\n",
2727 rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time),
2728 ctdb->tunable.recovery_ban_period));
2729 ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period);
2732 /* get relevant tunables */
2733 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2735 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2739 /* get the current recovery lock file from the server */
2740 if (update_recovery_lock_file(ctdb) != 0) {
2741 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2745 /* Make sure that if recovery lock verification becomes disabled when
2748 if (ctdb->tunable.verify_recovery_lock == 0) {
2749 if (ctdb->recovery_lock_fd != -1) {
2750 close(ctdb->recovery_lock_fd);
2751 ctdb->recovery_lock_fd = -1;
/* find out our own node number */
2755 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2756 if (pnn == (uint32_t)-1) {
2757 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2761 /* get the vnnmap */
2762 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2764 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2769 /* get number of nodes */
2771 talloc_free(rec->nodemap);
2772 rec->nodemap = NULL;
2775 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2777 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2780 nodemap = rec->nodemap;
2782 /* check which node is the recovery master */
2783 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2785 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2789 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2790 if (rec->recmaster != pnn) {
2791 if (rec->ip_reallocate_ctx != NULL) {
2792 talloc_free(rec->ip_reallocate_ctx);
2793 rec->ip_reallocate_ctx = NULL;
2794 rec->reallocate_callers = NULL;
2797 /* if there are takeovers requested, perform it and notify the waiters */
2798 if (rec->reallocate_callers) {
2799 process_ipreallocate_requests(ctdb, rec);
/* recmaster == -1 means no recmaster has been set yet: force one */
2802 if (rec->recmaster == (uint32_t)-1) {
2803 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2804 force_election(rec, pnn, nodemap);
2808 /* check that we (recovery daemon) and the local ctdb daemon
2809 agrees on whether we are banned or not
2811 if (nodemap->nodes[pnn].flags & NODE_FLAGS_BANNED) {
2812 if (rec->banned_nodes[pnn] == NULL) {
2813 if (rec->recmaster == pnn) {
2814 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on recmaster thinks this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
2816 ctdb_unban_node(rec, pnn);
2818 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on non-recmaster thinks this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
2819 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2820 ctdb_set_culprit(rec, pnn);
/* the opposite disagreement: recd thinks banned, daemon does not */
2825 if (rec->banned_nodes[pnn] != NULL) {
2826 if (rec->recmaster == pnn) {
2827 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on recmaster does not think this node is BANNED but the recovery master disagrees. Unbanning the node\n"));
2829 ctdb_unban_node(rec, pnn);
2831 DEBUG(DEBUG_NOTICE,("Local ctdb daemon on non-recmaster does not think this node is BANNED but the recovery master disagrees. Re-banning the node\n"));
2833 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2834 ctdb_set_culprit(rec, pnn);
2840 /* remember our own node flags */
2841 rec->node_flags = nodemap->nodes[pnn].flags;
2843 /* count how many active nodes there are */
2844 rec->num_active = 0;
2845 rec->num_connected = 0;
2846 for (i=0; i<nodemap->num; i++) {
2847 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2850 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2851 rec->num_connected++;
2856 /* verify that the recmaster node is still active */
2857 for (j=0; j<nodemap->num; j++) {
2858 if (nodemap->nodes[j].pnn==rec->recmaster) {
/* j == num means the recmaster was not found in the nodemap at all */
2863 if (j == nodemap->num) {
2864 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2865 force_election(rec, pnn, nodemap);
2869 /* if recovery master is disconnected we must elect a new recmaster */
2870 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2871 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2872 force_election(rec, pnn, nodemap);
2876 /* grab the nodemap from the recovery master to check if it is banned */
2877 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2878 mem_ctx, &recmaster_nodemap);
2880 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2881 nodemap->nodes[j].pnn));
/* NOTE(review): indexing the recmaster's nodemap with our local index
 * j assumes both nodemaps list nodes in the same order - confirm */
2886 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2887 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2888 force_election(rec, pnn, nodemap);
2893 /* verify that we have all ip addresses we should have and we don't
2894 * have addresses we shouldn't have.
2896 if (ctdb->do_checkpublicip) {
2897 if (verify_ip_allocation(ctdb, pnn) != 0) {
2898 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2904 /* if we are not the recmaster then we do not need to check
2905 if recovery is needed
2907 if (pnn != rec->recmaster) {
/* ---- everything below runs only on the recovery master ---- */
2912 /* ensure our local copies of flags are right */
2913 ret = update_local_flags(rec, nodemap);
2914 if (ret == MONITOR_ELECTION_NEEDED) {
2915 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2916 force_election(rec, pnn, nodemap);
2919 if (ret != MONITOR_OK) {
2920 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2924 /* update the list of public ips that a node can handle for
2927 if (ctdb->num_nodes != nodemap->num) {
2928 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2929 reload_nodes_file(ctdb);
2932 for (j=0; j<nodemap->num; j++) {
2933 /* release any existing data */
2934 if (ctdb->nodes[j]->public_ips) {
2935 talloc_free(ctdb->nodes[j]->public_ips);
2936 ctdb->nodes[j]->public_ips = NULL;
2939 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2943 /* grab a new shiny list of public ips from the node */
2944 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2945 ctdb->nodes[j]->pnn,
2947 &ctdb->nodes[j]->public_ips)) {
2948 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2949 ctdb->nodes[j]->pnn));
2955 /* verify that all active nodes agree that we are the recmaster */
2956 switch (verify_recmaster(rec, nodemap, pnn)) {
2957 case MONITOR_RECOVERY_NEEDED:
2958 /* can not happen */
2960 case MONITOR_ELECTION_NEEDED:
2961 force_election(rec, pnn, nodemap);
2965 case MONITOR_FAILED:
2970 if (rec->need_recovery) {
2971 /* a previous recovery didn't finish */
2972 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, -1);
2976 /* verify that all active nodes are in normal mode
2977 and not in recovery mode
2979 switch (verify_recmode(ctdb, nodemap)) {
2980 case MONITOR_RECOVERY_NEEDED:
2981 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
2983 case MONITOR_FAILED:
2985 case MONITOR_ELECTION_NEEDED:
2986 /* can not happen */
2992 if (ctdb->tunable.verify_recovery_lock != 0) {
2993 /* we should have the reclock - check its not stale */
2994 ret = check_recovery_lock(ctdb);
2996 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
2997 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
3002 /* get the nodemap for all active remote nodes
3004 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3005 if (remote_nodemaps == NULL) {
3006 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3009 for(i=0; i<nodemap->num; i++) {
3010 remote_nodemaps[i] = NULL;
3012 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3013 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3017 /* verify that all other nodes have the same nodemap as we have
3019 for (j=0; j<nodemap->num; j++) {
3020 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3024 if (remote_nodemaps[j] == NULL) {
3025 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3026 ctdb_set_culprit(rec, j);
3031 /* if the nodes disagree on how many nodes there are
3032 then this is a good reason to try recovery
3034 if (remote_nodemaps[j]->num != nodemap->num) {
3035 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3036 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3037 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
3041 /* if the nodes disagree on which nodes exist and are
3042 active, then that is also a good reason to do recovery
3044 for (i=0;i<nodemap->num;i++) {
3045 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3046 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3047 nodemap->nodes[j].pnn, i,
3048 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3049 do_recovery(rec, mem_ctx, pnn, nodemap,
3050 vnnmap, nodemap->nodes[j].pnn);
3055 /* verify the flags are consistent
3057 for (i=0; i<nodemap->num; i++) {
3058 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3062 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3063 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3064 nodemap->nodes[j].pnn,
3065 nodemap->nodes[i].pnn,
3066 remote_nodemaps[j]->nodes[i].flags,
/* NOTE(review): the comparison above uses nodes[i].flags but this
 * DEBUG argument prints nodes[j].flags - looks like it should be
 * nodemap->nodes[i].flags; confirm against upstream */
3067 nodemap->nodes[j].flags));
3069 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3070 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3071 do_recovery(rec, mem_ctx, pnn, nodemap,
3072 vnnmap, nodemap->nodes[j].pnn);
3075 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3076 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3077 do_recovery(rec, mem_ctx, pnn, nodemap,
3078 vnnmap, nodemap->nodes[j].pnn);
3086 /* there better be the same number of lmasters in the vnn map
3087 as there are active nodes or we will have to do a recovery
3089 if (vnnmap->size != rec->num_active) {
3090 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3091 vnnmap->size, rec->num_active));
3092 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, ctdb->pnn);
3096 /* verify that all active nodes in the nodemap also exist in
3099 for (j=0; j<nodemap->num; j++) {
3100 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3103 if (nodemap->nodes[j].pnn == pnn) {
3107 for (i=0; i<vnnmap->size; i++) {
3108 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3112 if (i == vnnmap->size) {
3113 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3114 nodemap->nodes[j].pnn));
3115 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
3121 /* verify that all other nodes have the same vnnmap
3122 and are from the same generation
3124 for (j=0; j<nodemap->num; j++) {
3125 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3128 if (nodemap->nodes[j].pnn == pnn) {
3132 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3133 mem_ctx, &remote_vnnmap);
3135 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3136 nodemap->nodes[j].pnn));
3140 /* verify the vnnmap generation is the same */
3141 if (vnnmap->generation != remote_vnnmap->generation) {
3142 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3143 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3144 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
3148 /* verify the vnnmap size is the same */
3149 if (vnnmap->size != remote_vnnmap->size) {
3150 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3151 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3152 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap, nodemap->nodes[j].pnn);
3156 /* verify the vnnmap is the same */
3157 for (i=0;i<vnnmap->size;i++) {
3158 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3159 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3160 nodemap->nodes[j].pnn));
3161 do_recovery(rec, mem_ctx, pnn, nodemap,
3162 vnnmap, nodemap->nodes[j].pnn);
3168 /* we might need to change who has what IP assigned */
3169 if (rec->need_takeover_run) {
3170 rec->need_takeover_run = false;
3172 /* execute the "startrecovery" event script on all nodes */
3173 ret = run_startrecovery_eventscript(rec, nodemap);
3175 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3176 do_recovery(rec, mem_ctx, pnn, nodemap,
/* perform the actual IP takeover run across the cluster */
3180 ret = ctdb_takeover_run(ctdb, nodemap);
3182 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3183 do_recovery(rec, mem_ctx, pnn, nodemap,
3187 /* execute the "recovered" event script on all nodes */
3188 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3190 // we cant check whether the event completed successfully
3191 // since this script WILL fail if the node is in recovery mode
3192 // and if that race happens, the code here would just cause a second
3193 // cascading recovery.
3195 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3196 do_recovery(rec, mem_ctx, pnn, nodemap,
3208 event handler for when the main ctdbd dies
/* fd-event callback on the pipe to the parent ctdbd: the read end
 * becoming readable means the parent closed its end (parent died). */
3210 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3211 uint16_t flags, void *private_data)
3213 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3218 called regularly to verify that the recovery daemon is still running
/* Timed event in the MAIN daemon: probe the recoverd child with
 * kill(pid, 0); if it is gone, perform an orderly shutdown of the
 * whole daemon. Reschedules itself every 30 seconds. */
3220 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3221 struct timeval yt, void *p)
3223 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3225 if (kill(ctdb->recoverd_pid, 0) != 0) {
3226 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
/* tear everything down: recoverd, keepalives, monitoring, IPs,
 * transport, then run the "shutdown" event script */
3228 ctdb_stop_recoverd(ctdb);
3229 ctdb_stop_keepalive(ctdb);
3230 ctdb_stop_monitoring(ctdb);
3231 ctdb_release_all_ips(ctdb);
3232 if (ctdb->methods != NULL) {
3233 ctdb->methods->shutdown(ctdb);
3235 ctdb_event_script(ctdb, "shutdown");
/* re-arm ourselves for the next 30-second check */
3240 event_add_timed(ctdb->ev, ctdb,
3241 timeval_current_ofs(30, 0),
3242 ctdb_check_recd, ctdb);
/* SIGCHLD handler for the recovery daemon: reap exited children with
 * a non-blocking waitpid so no zombies accumulate. */
3245 static void recd_sig_child_handler(struct event_context *ev,
3246 struct signal_event *se, int signum, int count,
3250 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3255 pid = waitpid(-1, &status, WNOHANG);
/* ECHILD just means there was nothing left to reap */
3257 if (errno != ECHILD) {
3258 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3263 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3269 startup the recovery daemon as a child of the main ctdb daemon
/* Fork the recovery daemon. The parent keeps the write end of a pipe
 * (child notices parent death when the pipe closes) and arms a periodic
 * liveness check; the child switches to client mode, installs parent-
 * death and SIGCHLD handlers, and enters monitor_cluster() forever.
 * Returns 0 in the parent on success - TODO confirm elided returns. */
3271 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3274 struct signal_event *se;
/* pipe used by the child to detect the parent's death */
3276 if (pipe(fd) != 0) {
3280 ctdb->ctdbd_pid = getpid();
3282 ctdb->recoverd_pid = fork();
3283 if (ctdb->recoverd_pid == -1) {
/* parent: schedule the 30-second recoverd liveness check and return */
3287 if (ctdb->recoverd_pid != 0) {
3289 event_add_timed(ctdb->ev, ctdb,
3290 timeval_current_ofs(30, 0),
3291 ctdb_check_recd, ctdb);
/* child: reseed the PRNG so it does not share the parent's stream */
3297 srandom(getpid() ^ time(NULL));
3299 if (switch_from_server_to_client(ctdb) != 0) {
3300 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
/* watch the pipe's read end - readable means the parent is gone */
3304 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3305 ctdb_recoverd_parent, &fd[0]);
3307 /* set up a handler to pick up sigchld */
3308 se = event_add_signal(ctdb->ev, ctdb,
3310 recd_sig_child_handler,
3313 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* never returns during normal operation */
3317 monitor_cluster(ctdb);
3319 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3324 shutdown the recovery daemon
/* Terminate the recovery daemon child with SIGTERM; a zero pid means
 * it was never started, so there is nothing to do. */
3326 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3328 if (ctdb->recoverd_pid == 0) {
3332 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3333 kill(ctdb->recoverd_pid, SIGTERM);