4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
37 struct ip_reallocate_list {
/* next entry in the singly linked list of pending "ctdb ipreallocate" callers */
38 struct ip_reallocate_list *next;
/* reply context used to answer this caller once the takeover run has finished */
39 struct rd_memdump_reply *rd;
/* per-node bookkeeping used to decide when a repeatedly misbehaving node gets banned */
42 struct ctdb_banning_state {
/* NOTE(review): a "count" member is used elsewhere (ban_state->count) but is not
   visible in this elided listing — confirm against the full source */
/* time the node was last reported as a culprit; lets old transgressions expire */
44 struct timeval last_reported_time;
48 private state of recovery daemon
50 struct ctdb_recoverd {
/* owning ctdb context */
51 struct ctdb_context *ctdb;
/* number of nodes currently connected to us */
54 uint32_t num_connected;
/* pnn of the node that most recently triggered a recovery */
55 uint32_t last_culprit_node;
/* our current view of the cluster node map */
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
/* set when a takeover run is still outstanding */
58 bool need_takeover_run;
/* pending election-broadcast timer, NULL when none is scheduled */
61 struct timed_event *send_election_te;
/* non-NULL while an election is in progress (see ctdb_wait_election) */
62 struct timed_event *election_timeout;
/* list of in-flight vacuum fetch operations */
63 struct vacuum_info *vacuum_info;
/* talloc parent for the reallocate_callers list below */
64 TALLOC_CTX *ip_reallocate_ctx;
/* "ctdb ipreallocate" callers waiting for the takeover run to finish */
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
/* Timeouts derived from tunables.  NOTE: both macros expand to an expression that
   references a variable named "ctdb" in the calling scope. */
69 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
70 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
74 ban a node for a period of time
/* Ban node "pnn" for "ban_time" seconds by sending it a SET_BAN control.
   Invalid pnns are rejected with an error log; a failed control is logged
   but otherwise not propagated (void return). */
76 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
79 struct ctdb_context *ctdb = rec->ctdb;
80 struct ctdb_ban_time bantime;
82 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* NOTE(review): bantime.pnn is presumably assigned in an elided line — confirm */
90 bantime.time = ban_time;
92 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
94 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* outcome of one monitoring pass: healthy, recovery required, election required, or the check itself failed */
100 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
104 run the "recovered" eventscript on all nodes
/* Broadcast the END_RECOVERY control to all active nodes so each runs its
   "recovered" event scripts.  "caller" is only used to attribute failures in
   the log.  Returns non-zero (elided) on failure of the async control. */
106 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
111 tmp_ctx = talloc_new(ctdb);
112 CTDB_NO_MEMORY(ctdb, tmp_ctx);
/* only active nodes take part; true = exclude ourselves is elided — TODO confirm flag meaning */
114 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
115 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
117 CONTROL_TIMEOUT(), false, tdb_null,
120 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
122 talloc_free(tmp_ctx);
126 talloc_free(tmp_ctx);
131 remember the trouble maker
/* Charge "count" credits against node "culprit" for causing trouble.  Credits
   decay: if the node has behaved for longer than recovery_grace_period the
   old count is forgiven first.  Also records the node as last_culprit_node. */
133 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
135 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
136 struct ctdb_banning_state *ban_state;
/* NOTE(review): off-by-one — culprit indexes ctdb->nodes[] below, so the valid
   range check should be ">= ctdb->num_nodes"; culprit == num_nodes slips through
   and reads one past the end of the array.  Fix upstream. */
138 if (culprit > ctdb->num_nodes) {
139 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
/* lazily allocate the per-node ban state on first offence */
143 if (ctdb->nodes[culprit]->ban_state == NULL) {
144 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
145 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
149 ban_state = ctdb->nodes[culprit]->ban_state;
150 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
151 /* this was the first time in a long while this node
152 misbehaved so we will forgive any old transgressions.
154 ban_state->count = 0;
157 ban_state->count += count;
158 ban_state->last_reported_time = timeval_current();
159 rec->last_culprit_node = culprit;
163 remember the trouble maker
/* convenience wrapper: charge a single credit against the culprit node */
165 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
167 ctdb_set_culprit_count(rec, culprit, 1);
171 /* this callback is called for every node that failed to execute the
/* async-control fail callback: a node failed its "startrecovery" event, so
   mark it as a recovery culprit (one credit) */
174 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
176 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
178 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
180 ctdb_set_culprit(rec, node_pnn);
184 run the "startrecovery" eventscript on all nodes
/* Broadcast the START_RECOVERY control to all active nodes so each runs its
   "startrecovery" event scripts.  Nodes that fail are charged a culprit
   credit via startrecovery_fail_callback. */
186 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
190 struct ctdb_context *ctdb = rec->ctdb;
192 tmp_ctx = talloc_new(ctdb);
193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
195 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
198 CONTROL_TIMEOUT(), false, tdb_null,
200 startrecovery_fail_callback,
202 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
203 talloc_free(tmp_ctx);
207 talloc_free(tmp_ctx);
/* async-control success callback for GET_CAPABILITIES: validate the reply
   payload (exactly one uint32_t) and store it in the node's capabilities.
   NOTE(review): "lenght" typo in the log message below — cosmetic only. */
211 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
213 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
214 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
/* guard against a pnn beyond our local nodes array */
217 if (node_pnn < ctdb->num_nodes) {
218 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
223 update the node capabilities for all connected nodes
/* Query every active node for its capabilities via an async broadcast of
   GET_CAPABILITIES; results land in ctdb->nodes[]->capabilities through
   async_getcap_callback. */
225 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
230 tmp_ctx = talloc_new(ctdb);
231 CTDB_NO_MEMORY(ctdb, tmp_ctx);
233 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
234 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
238 async_getcap_callback, NULL,
240 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
241 talloc_free(tmp_ctx);
245 talloc_free(tmp_ctx);
/* fail callback for the freeze step of set_recovery_mode: charge the failing
   node a full nodemap->num credits so it is banned quickly */
249 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
251 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
253 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
254 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* fail callback for TRANSACTION_START: charge the failing node a full
   nodemap->num credits so it is banned quickly */
257 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
259 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
261 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
262 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
266 change recovery mode on all nodes
/* Switch the recovery mode on all active nodes.  When entering
   CTDB_RECOVERY_ACTIVE the databases are first frozen, one FREEZE broadcast
   per database priority level; only then is SET_RECMODE broadcast. */
268 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
274 tmp_ctx = talloc_new(ctdb);
275 CTDB_NO_MEMORY(ctdb, tmp_ctx);
277 /* freeze all nodes */
278 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
279 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
/* freeze each priority band separately; priorities are 1-based */
282 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
283 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
288 set_recmode_fail_callback,
290 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
291 talloc_free(tmp_ctx);
/* the new mode travels as a single uint32_t payload */
298 data.dsize = sizeof(uint32_t);
299 data.dptr = (unsigned char *)&rec_mode;
301 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
308 talloc_free(tmp_ctx);
312 talloc_free(tmp_ctx);
317 change recovery master on all node
/* Tell every active node that "pnn" is now the recovery master by
   broadcasting SET_RECMASTER with the pnn as a uint32_t payload. */
319 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
325 tmp_ctx = talloc_new(ctdb);
326 CTDB_NO_MEMORY(ctdb, tmp_ctx);
328 data.dsize = sizeof(uint32_t);
329 data.dptr = (unsigned char *)&pnn;
331 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
334 CONTROL_TIMEOUT(), false, data,
337 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
338 talloc_free(tmp_ctx);
342 talloc_free(tmp_ctx);
346 /* update all remote nodes to use the same db priority that we have
347 this can fail if the remote node has not yet been upgraded to
348 support this function, so we always return success and never fail
349 a recovery if this call fails.
/* Push our local per-database priorities to all active remote nodes.  Best
   effort by design (see comment above): failures are logged but never fail
   the recovery, since older nodes may not support SET_DB_PRIORITY. */
351 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
352 struct ctdb_node_map *nodemap,
353 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
358 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
360 /* step through all local databases */
361 for (db=0; db<dbmap->num;db++) {
363 struct ctdb_db_priority db_prio;
366 db_prio.db_id = dbmap->dbs[db].dbid;
/* read the authoritative priority from our own node first */
367 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
369 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
373 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
375 data.dptr = (uint8_t *)&db_prio;
376 data.dsize = sizeof(db_prio);
378 if (ctdb_client_async_control(ctdb,
379 CTDB_CONTROL_SET_DB_PRIORITY,
381 CONTROL_TIMEOUT(), false, data,
384 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
392 ensure all other nodes have attached to any databases that we have
/* For every database we have locally, make sure every other active node is
   attached to it too, creating it remotely where missing.  "pnn" is our own
   node number and is skipped. */
394 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
395 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
398 struct ctdb_dbid_map *remote_dbmap;
400 /* verify that all other nodes have all our databases */
401 for (j=0; j<nodemap->num; j++) {
402 /* we don't need to check ourselves */
403 if (nodemap->nodes[j].pnn == pnn) {
406 /* dont check nodes that are unavailable */
407 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
411 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
412 mem_ctx, &remote_dbmap);
414 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
418 /* step through all local databases */
419 for (db=0; db<dbmap->num;db++) {
/* linear scan of the remote dbmap for our dbid */
423 for (i=0;i<remote_dbmap->num;i++) {
424 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
428 /* the remote node already have this database */
429 if (i!=remote_dbmap->num) {
432 /* ok so we need to create this database */
433 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
436 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
439 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
440 mem_ctx, name, dbmap->dbs[db].persistent);
442 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
453 ensure we are attached to any databases that anyone else is attached to
/* Mirror of create_missing_remote_databases: attach locally to any database
   that exists on some other active node but not here.  On success *dbmap is
   re-read so it reflects the newly attached databases. */
455 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
456 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
459 struct ctdb_dbid_map *remote_dbmap;
461 /* verify that we have all database any other node has */
462 for (j=0; j<nodemap->num; j++) {
463 /* we don't need to check ourselves */
464 if (nodemap->nodes[j].pnn == pnn) {
467 /* dont check nodes that are unavailable */
468 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
472 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
473 mem_ctx, &remote_dbmap);
475 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
479 /* step through all databases on the remote node */
480 for (db=0; db<remote_dbmap->num;db++) {
/* linear scan of our own dbmap for the remote dbid */
483 for (i=0;i<(*dbmap)->num;i++) {
484 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
488 /* we already have this db locally */
489 if (i!=(*dbmap)->num) {
492 /* ok so we need to create this database and
495 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
496 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
498 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
499 nodemap->nodes[j].pnn));
502 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
503 remote_dbmap->dbs[db].persistent);
505 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our dbmap so the caller sees the newly attached databases */
508 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
521 pull the remote database contents from one node into the recdb
/* Pull database "dbid" from node "srcnode" and merge every record into the
   temporary recovery db.  Merge rule: keep the incoming record unless an
   existing copy has a newer rsn, or an equal rsn while its dmaster is the
   recovery master (i.e. our copy is authoritative). */
523 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
524 struct tdb_wrap *recdb, uint32_t dbid)
528 struct ctdb_marshall_buffer *reply;
529 struct ctdb_rec_data *rec;
531 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
533 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
534 CONTROL_TIMEOUT(), &outdata);
536 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
537 talloc_free(tmp_ctx);
541 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the marshalled reply before walking it */
543 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
544 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
545 talloc_free(tmp_ctx);
/* walk the packed records: each ctdb_rec_data is rec->length bytes long */
549 rec = (struct ctdb_rec_data *)&reply->data[0];
553 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
555 struct ctdb_ltdb_header *hdr;
/* key bytes come first in rec->data, followed by the data bytes */
558 key.dptr = &rec->data[0];
559 key.dsize = rec->keylen;
560 data.dptr = &rec->data[key.dsize];
561 data.dsize = rec->datalen;
/* every record payload starts with a ctdb_ltdb_header */
563 hdr = (struct ctdb_ltdb_header *)data.dptr;
565 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
566 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
567 talloc_free(tmp_ctx);
571 /* fetch the existing record, if any */
572 existing = tdb_fetch(recdb->tdb, key);
574 if (existing.dptr != NULL) {
575 struct ctdb_ltdb_header header;
576 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
577 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
578 (unsigned)existing.dsize, srcnode));
580 talloc_free(tmp_ctx);
583 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* skip the incoming copy unless it is strictly newer, or same-rsn while
   the stored copy's dmaster is not the recovery master */
585 if (!(header.rsn < hdr->rsn ||
586 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
591 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
592 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
593 talloc_free(tmp_ctx);
598 talloc_free(tmp_ctx);
604 pull all the remote database contents into the recdb
/* Pull database "dbid" from every available node into the temporary recovery
   db (records are merged by rsn in pull_one_remote_database).  A failing
   source node is charged nodemap->num culprit credits. */
606 static int pull_remote_database(struct ctdb_context *ctdb,
607 struct ctdb_recoverd *rec,
608 struct ctdb_node_map *nodemap,
609 struct tdb_wrap *recdb, uint32_t dbid)
613 /* pull all records from all other nodes across onto this node
614 (this merges based on rsn)
616 for (j=0; j<nodemap->num; j++) {
617 /* dont merge from nodes that are unavailable */
618 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
621 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
622 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
623 nodemap->nodes[j].pnn));
624 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
634 update flags on all active nodes
/* Propagate node "pnn"'s flags to the whole cluster: set "flags" and clear
   everything else (~flags) via the MODFLAGS control. */
636 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
640 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
642 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
650 ensure all nodes have the same vnnmap we do
/* Push our vnn map to every available node, one SETVNNMAP control at a time.
   NOTE(review): the error message logs "pnn" (our own node) rather than the
   node that failed — likely should be nodemap->nodes[j].pnn. */
652 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
653 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
657 /* push the new vnn map out to all the nodes */
658 for (j=0; j<nodemap->num; j++) {
659 /* dont push to nodes that are unavailable */
660 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
664 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
666 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* fields of struct vacuum_info (opening "struct vacuum_info {" line is elided
   from this listing): one in-flight vacuum fetch operation, kept on the
   doubly linked rec->vacuum_info list */
676 struct vacuum_info *next, *prev;
677 struct ctdb_recoverd *rec;
679 struct ctdb_db_context *ctdb_db;
/* the marshalled record list we are working through */
680 struct ctdb_marshall_buffer *recs;
/* cursor into recs->data: the next record to process */
681 struct ctdb_rec_data *r;
684 static void vacuum_fetch_next(struct vacuum_info *v);
687 called when a vacuum fetch has completed - just free it and do the next one
/* completion callback for one vacuum fetch call: move on to the next record */
689 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
691 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
693 vacuum_fetch_next(v);
698 process the next element from the vacuum list
/* Process records from the vacuum list one at a time: for each record that is
   not already local, issue a CTDB_NULL_FUNC call with IMMEDIATE_MIGRATION to
   pull it here, then continue from vacuum_fetch_callback.  Records we cannot
   lock, cannot read, or that are malformed/already local are skipped. */
700 static void vacuum_fetch_next(struct vacuum_info *v)
702 struct ctdb_call call;
703 struct ctdb_rec_data *r;
705 while (v->recs->count) {
706 struct ctdb_client_call_state *state;
708 struct ctdb_ltdb_header *hdr;
711 call.call_id = CTDB_NULL_FUNC;
712 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance the cursor past the current packed record */
715 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
718 call.key.dptr = &r->data[0];
719 call.key.dsize = r->keylen;
721 /* ensure we don't block this daemon - just skip a record if we can't get
723 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
727 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
728 if (data.dptr == NULL) {
729 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* too short to even hold an ltdb header - skip it */
733 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
735 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
739 hdr = (struct ctdb_ltdb_header *)data.dptr;
740 if (hdr->dmaster == v->rec->ctdb->pnn) {
741 /* its already local */
743 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
749 state = ctdb_call_send(v->ctdb_db, &call);
750 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
752 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
/* continue asynchronously once this migration completes */
756 state->async.fn = vacuum_fetch_callback;
757 state->async.private_data = v;
766 destroy a vacuum info structure
/* talloc destructor: unlink this vacuum operation from rec->vacuum_info */
768 static int vacuum_info_destructor(struct vacuum_info *v)
770 DLIST_REMOVE(v->rec->vacuum_info, v);
776 handler for vacuum fetch
/* Message handler for vacuum-fetch requests: "data" carries a marshalled list
   of records some node wants migrated to us.  Looks up the database (and
   whether it is persistent), attaches to it, duplicates the record list into
   a new vacuum_info, links it into rec->vacuum_info and kicks off the first
   fetch.  Duplicate requests for the same (srcnode, db) pair are ignored. */
778 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
779 TDB_DATA data, void *private_data)
781 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
782 struct ctdb_marshall_buffer *recs;
784 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
786 struct ctdb_dbid_map *dbmap=NULL;
787 bool persistent = false;
788 struct ctdb_db_context *ctdb_db;
789 struct ctdb_rec_data *r;
791 struct vacuum_info *v;
793 recs = (struct ctdb_marshall_buffer *)data.dptr;
794 r = (struct ctdb_rec_data *)&recs->data[0];
/* nothing to do for an empty record list */
796 if (recs->count == 0) {
797 talloc_free(tmp_ctx);
/* ignore the request if we already have a vacuum run in flight for this
   source node and database */
803 for (v=rec->vacuum_info;v;v=v->next) {
804 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
805 /* we're already working on records from this node */
806 talloc_free(tmp_ctx);
811 /* work out if the database is persistent */
812 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
814 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
815 talloc_free(tmp_ctx);
819 for (i=0;i<dbmap->num;i++) {
820 if (dbmap->dbs[i].dbid == recs->db_id) {
821 persistent = dbmap->dbs[i].persistent;
825 if (i == dbmap->num) {
826 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
827 talloc_free(tmp_ctx);
831 /* find the name of this database */
832 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
833 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
834 talloc_free(tmp_ctx);
/* attach to the database (no-op if we are already attached) */
839 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
840 if (ctdb_db == NULL) {
841 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
842 talloc_free(tmp_ctx);
846 v = talloc_zero(rec, struct vacuum_info);
848 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
849 talloc_free(tmp_ctx);
854 v->srcnode = srcnode;
855 v->ctdb_db = ctdb_db;
/* take our own copy of the record list - "data" belongs to the caller */
856 v->recs = talloc_memdup(v, recs, data.dsize);
857 if (v->recs == NULL) {
858 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
860 talloc_free(tmp_ctx);
863 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
865 DLIST_ADD(rec->vacuum_info, v);
867 talloc_set_destructor(v, vacuum_info_destructor);
869 vacuum_fetch_next(v);
870 talloc_free(tmp_ctx);
875 called when ctdb_wait_timeout should finish
/* timer callback for ctdb_wait_timeout: flip the caller's flag so its
   event loop terminates */
877 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
878 struct timeval yt, void *p)
880 uint32_t *timed_out = (uint32_t *)p;
885 wait for a given number of seconds
/* Sleep for "secs" seconds while still pumping the event loop, so timers and
   incoming packets keep being serviced. */
887 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
889 uint32_t timed_out = 0;
890 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
/* loop (condition elided) until ctdb_wait_handler sets timed_out */
892 event_loop_once(ctdb->ev);
897 called when an election times out (ends)
/* timer callback: the election window has closed - clear the timer pointer so
   ctdb_wait_election's loop terminates */
899 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
900 struct timeval t, void *p)
902 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
903 rec->election_timeout = NULL;
905 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
910 wait for an election to finish. It finished election_timeout seconds after
911 the last election packet is received
/* Block (while pumping the event loop) until the election timer fires, i.e.
   until election_timeout seconds have passed since the last election packet. */
913 static void ctdb_wait_election(struct ctdb_recoverd *rec)
915 struct ctdb_context *ctdb = rec->ctdb;
916 while (rec->election_timeout) {
917 event_loop_once(ctdb->ev);
922 Update our local flags from all remote connected nodes.
923 This is only run when we are, or we believe we are, the recovery master
/* Fetch the nodemap from every connected remote node and reconcile flags:
   if a remote node's view of its own flags differs from ours, push our view
   to the cluster via MODFLAGS and then adopt the remote value locally.
   Returns a MONITOR_* code (MONITOR_FAILED on fetch errors; success path
   return is elided from this listing). */
925 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
928 struct ctdb_context *ctdb = rec->ctdb;
929 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
931 /* get the nodemap for all active remote nodes and verify
932 they are the same as for this node
934 for (j=0; j<nodemap->num; j++) {
935 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
938 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
941 if (nodemap->nodes[j].pnn == ctdb->pnn) {
945 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
946 mem_ctx, &remote_nodemap);
948 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
949 nodemap->nodes[j].pnn));
950 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
951 talloc_free(mem_ctx);
952 return MONITOR_FAILED;
954 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
955 /* We should tell our daemon about this so it
956 updates its flags or else we will log the same
957 message again in the next iteration of recovery.
958 Since we are the recovery master we can just as
959 well update the flags on all nodes.
961 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
963 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
967 /* Update our local copy of the flags in the recovery
970 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
971 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
972 nodemap->nodes[j].flags));
973 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
975 talloc_free(remote_nodemap);
977 talloc_free(mem_ctx);
982 /* Create a new random generation ip.
983 The generation id can not be the INVALID_GENERATION id
/* Produce a random generation id, drawing again until the value differs from
   INVALID_GENERATION (see the comment above). */
985 static uint32_t new_generation(void)
990 generation = random();
992 if (generation != INVALID_GENERATION) {
1002 create a temporary working database
/* Create the temporary recovery database (recdb.tdb in the ctdb db directory)
   used to merge records pulled from all nodes.  Opened O_CREAT|O_EXCL with
   TDB_NOLOCK (single-threaded use), plus TDB_NOMMAP when scheduling tweaks
   are disabled.  Returns NULL on failure. */
1004 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1007 struct tdb_wrap *recdb;
1010 /* open up the temporary recovery database */
1011 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
/* NOTE(review): an unlink() of any stale recdb.tdb is presumably in the elided
   lines, otherwise O_EXCL would fail on a leftover file — confirm */
1017 tdb_flags = TDB_NOLOCK;
1018 if (!ctdb->do_setsched) {
1019 tdb_flags |= TDB_NOMMAP;
1022 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1023 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1024 if (recdb == NULL) {
1025 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1035 a traverse function for pulling all relevent records from recdb
/* fields of struct recdb_data (struct keyword line elided): traversal state
   shared with traverse_recdb - accumulates records into a marshall buffer */
1038 struct ctdb_context *ctdb;
/* growing blob of marshalled records ready to push to the nodes */
1039 struct ctdb_marshall_buffer *recdata;
/* tdb traverse callback: append one recdb record to params->recdata, after
   rewriting its dmaster to point at this node.  Empty records (header only)
   are skipped.  On any failure params->failed is set so the caller aborts. */
1044 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1046 struct recdb_data *params = (struct recdb_data *)p;
1047 struct ctdb_rec_data *rec;
1048 struct ctdb_ltdb_header *hdr;
1050 /* skip empty records */
1051 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1055 /* update the dmaster field to point to us */
1056 hdr = (struct ctdb_ltdb_header *)data.dptr;
1057 hdr->dmaster = params->ctdb->pnn;
1059 /* add the record to the blob ready to send to the nodes */
1060 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1062 params->failed = true;
/* NOTE(review): on realloc failure the old recdata pointer is overwritten with
   NULL, leaking the original buffer; the failed flag below at least aborts the
   traverse.  Consider the tmp-pointer realloc idiom upstream. */
1065 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1066 if (params->recdata == NULL) {
1067 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1068 rec->length + params->len, params->recdata->count));
1069 params->failed = true;
1072 params->recdata->count++;
1073 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1074 params->len += rec->length;
1081 push the recdb database out to all nodes
/* Serialise the whole temporary recovery db into one marshall buffer (via
   traverse_recdb, which also stamps us as dmaster on every record) and
   broadcast it to all active nodes with a PUSH_DB control. */
1083 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1084 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1086 struct recdb_data params;
1087 struct ctdb_marshall_buffer *recdata;
1089 TALLOC_CTX *tmp_ctx;
1092 tmp_ctx = talloc_new(ctdb);
1093 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1095 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1096 CTDB_NO_MEMORY(ctdb, recdata);
1098 recdata->db_id = dbid;
/* initial length is just the marshall buffer header */
1101 params.recdata = recdata;
1102 params.len = offsetof(struct ctdb_marshall_buffer, data);
1103 params.failed = false;
1105 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1106 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1107 talloc_free(params.recdata);
1108 talloc_free(tmp_ctx);
1112 if (params.failed) {
1113 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1114 talloc_free(params.recdata);
1115 talloc_free(tmp_ctx);
/* traverse_recdb may have realloc'd the buffer - pick up the final pointer */
1119 recdata = params.recdata;
1121 outdata.dptr = (void *)recdata;
1122 outdata.dsize = params.len;
1124 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1125 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1127 CONTROL_TIMEOUT(), false, outdata,
1130 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1131 talloc_free(recdata);
1132 talloc_free(tmp_ctx);
1136 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1137 dbid, recdata->count));
1139 talloc_free(recdata);
1140 talloc_free(tmp_ctx);
1147 go through a full recovery on one database
/* Recover one database end-to-end: create the temporary recdb, pull and merge
   every node's copy into it, wipe the database cluster-wide (safe inside the
   recovery transaction identified by transaction_id), then push the merged
   contents back out to all active nodes. */
1149 static int recover_database(struct ctdb_recoverd *rec,
1150 TALLOC_CTX *mem_ctx,
1153 struct ctdb_node_map *nodemap,
1154 uint32_t transaction_id)
1156 struct tdb_wrap *recdb;
1158 struct ctdb_context *ctdb = rec->ctdb;
1160 struct ctdb_control_wipe_database w;
1163 recdb = create_recdb(ctdb, mem_ctx);
1164 if (recdb == NULL) {
1168 /* pull all remote databases onto the recdb */
1169 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
1171 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1175 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1177 /* wipe all the remote databases. This is safe as we are in a transaction */
/* w.db_id assignment is elided from this listing - TODO confirm */
1179 w.transaction_id = transaction_id;
1181 data.dptr = (void *)&w;
1182 data.dsize = sizeof(w);
1184 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1185 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1187 CONTROL_TIMEOUT(), false, data,
1190 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1195 /* push out the correct database. This sets the dmaster and skips
1196 the empty records */
1197 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1203 /* all done with this database */
1210 reload the nodes file
/* re-read the nodes file from disk into the ctdb context */
1212 static void reload_nodes_file(struct ctdb_context *ctdb)
1215 ctdb_load_nodes_file(ctdb);
1220 we are the recmaster, and recovery is needed - start a recovery run
1222 static int do_recovery(struct ctdb_recoverd *rec,
1223 TALLOC_CTX *mem_ctx, uint32_t pnn,
1224 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1226 struct ctdb_context *ctdb = rec->ctdb;
1228 uint32_t generation;
1229 struct ctdb_dbid_map *dbmap;
1232 struct timeval start_time;
1234 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1236 /* if recovery fails, force it again */
1237 rec->need_recovery = true;
1239 for (i=0; i<ctdb->num_nodes; i++) {
1240 struct ctdb_banning_state *ban_state;
1242 if (ctdb->nodes[i]->ban_state == NULL) {
1245 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1246 if (ban_state->count < 2*ctdb->num_nodes) {
1249 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1250 ctdb->nodes[i]->pnn, ban_state->count,
1251 ctdb->tunable.recovery_ban_period));
1252 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1253 ban_state->count = 0;
1257 if (ctdb->tunable.verify_recovery_lock != 0) {
1258 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1259 start_time = timeval_current();
1260 if (!ctdb_recovery_lock(ctdb, true)) {
1261 ctdb_set_culprit(rec, pnn);
1262 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1265 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1266 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1269 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1271 /* get a list of all databases */
1272 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1274 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1278 /* we do the db creation before we set the recovery mode, so the freeze happens
1279 on all databases we will be dealing with. */
1281 /* verify that we have all the databases any other node has */
1282 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1284 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1288 /* verify that all other nodes have all our databases */
1289 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1291 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1294 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1296 /* update the database priority for all remote databases */
1297 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1299 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1301 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1304 /* set recovery mode to active on all nodes */
1305 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1311 /* execute the "startrecovery" event script on all nodes */
1312 ret = run_startrecovery_eventscript(rec, nodemap);
1314 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1318 /* pick a new generation number */
1319 generation = new_generation();
1321 /* change the vnnmap on this node to use the new generation
1322 number but not on any other nodes.
1323 this guarantees that if we abort the recovery prematurely
1324 for some reason (a node stops responding?)
1325 that we can just return immediately and we will reenter
1326 recovery shortly again.
1327 I.e. we deliberately leave the cluster with an inconsistent
1328 generation id to allow us to abort recovery at any stage and
1329 just restart it from scratch.
1331 vnnmap->generation = generation;
1332 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1334 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1338 data.dptr = (void *)&generation;
1339 data.dsize = sizeof(uint32_t);
1341 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1342 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1344 CONTROL_TIMEOUT(), false, data,
1346 transaction_start_fail_callback,
1348 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1349 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1351 CONTROL_TIMEOUT(), false, tdb_null,
1355 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1360 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1362 for (i=0;i<dbmap->num;i++) {
1363 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1364 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1369 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1371 /* commit all the changes */
1372 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1374 CONTROL_TIMEOUT(), false, data,
1377 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1381 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1384 /* update the capabilities for all nodes */
1385 ret = update_capabilities(ctdb, nodemap);
1387 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1391 /* build a new vnn map with all the currently active and
1393 generation = new_generation();
1394 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1395 CTDB_NO_MEMORY(ctdb, vnnmap);
1396 vnnmap->generation = generation;
1398 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1399 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1400 for (i=j=0;i<nodemap->num;i++) {
1401 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1404 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1405 /* this node can not be an lmaster */
1406 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1411 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1412 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1413 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1416 if (vnnmap->size == 0) {
1417 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1419 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1420 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1421 vnnmap->map[0] = pnn;
1424 /* update to the new vnnmap on all nodes */
1425 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1427 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1431 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1433 /* update recmaster to point to us for all nodes */
1434 ret = set_recovery_master(ctdb, nodemap, pnn);
1436 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1440 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1443 update all nodes to have the same flags that we have
1445 for (i=0;i<nodemap->num;i++) {
1446 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1450 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1452 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1457 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1459 /* disable recovery mode */
1460 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1462 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1466 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1469 tell nodes to takeover their public IPs
1471 rec->need_takeover_run = false;
1472 ret = ctdb_takeover_run(ctdb, nodemap);
1474 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1477 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1479 /* execute the "recovered" event script on all nodes */
1480 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1482 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1486 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1488 /* send a message to all clients telling them that the cluster
1489 has been reconfigured */
1490 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1492 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1494 rec->need_recovery = false;
1496 /* we managed to complete a full recovery, make sure to forgive
1497 any past sins by the nodes that could now participate in the
1500 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1501 for (i=0;i<nodemap->num;i++) {
1502 struct ctdb_banning_state *ban_state;
1504 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1508 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1509 if (ban_state == NULL) {
1513 ban_state->count = 0;
1517 /* We just finished a recovery successfully.
1518 We now wait for rerecovery_timeout before we allow
1519 another recovery to take place.
1521 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1522 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1523 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
1530 elections are won by first checking the number of connected nodes, then
1531 the priority time, then the pnn
/* Data each node broadcasts when competing in a recovery-master election.
   Per the file comment: elections are won first on number of connected
   nodes, then on priority time, then on pnn as the final tie-breaker. */
1533 struct election_message {
1534 uint32_t num_connected; /* how many nodes this sender can see (not DISCONNECTED) */
1535 struct timeval priority_time; /* sender's start time; longer-running node wins ties */
1537 uint32_t node_flags; /* sender's own node flags; BANNED/STOPPED senders lose */
1541 form this nodes election data
1543 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1546 struct ctdb_node_map *nodemap;
1547 struct ctdb_context *ctdb = rec->ctdb;
/* fill in our identity and how long we have been running */
1551 em->pnn = rec->ctdb->pnn;
1552 em->priority_time = rec->priority_time;
1554 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1556 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
/* cache our own flags on rec so ctdb_election_win() can test BANNED/STOPPED */
1560 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1561 em->node_flags = rec->node_flags;
/* count every node that is not disconnected (including ourselves) */
1563 for (i=0;i<nodemap->num;i++) {
1564 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1565 em->num_connected++;
1569 /* we shouldn't try to win this election if we can't be a recmaster */
1570 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* zero connectivity plus a fresh timestamp guarantees we lose to any
   node that does hold the recmaster capability */
1571 em->num_connected = 0;
1572 em->priority_time = timeval_current();
1575 talloc_free(nodemap);
1579 see if the given election data wins
1581 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1583 struct election_message myem;
/* build our own election data to compare against the incoming *em */
1586 ctdb_election_data(rec, &myem);
1588 /* we can't win if we don't have the recmaster capability */
1589 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1593 /* we can't win if we are banned */
1594 if (rec->node_flags & NODE_FLAGS_BANNED) {
1598 /* we can't win if we are stopped */
1599 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1603 /* we will automatically win if the other node is banned */
1604 if (em->node_flags & NODE_FLAGS_BANNED) {
1608 /* we will automatically win if the other node is stopped */
1609 if (em->node_flags & NODE_FLAGS_STOPPED) {
1613 /* try to use the most connected node */
1615 cmp = (int)myem.num_connected - (int)em->num_connected;
1618 /* then the longest running node */
1620 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tie-breaker: compare pnns */
1624 cmp = (int)myem.pnn - (int)em->pnn;
1631 send out an election request
1633 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1636 TDB_DATA election_data;
1637 struct election_message emsg;
1639 struct ctdb_context *ctdb = rec->ctdb;
1641 srvid = CTDB_SRVID_RECOVERY;
/* package our election data into a TDB_DATA blob for broadcast */
1643 ctdb_election_data(rec, &emsg);
1645 election_data.dsize = sizeof(struct election_message);
1646 election_data.dptr = (unsigned char *)&emsg;
1649 /* send an election message to all active nodes */
1650 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1651 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1654 /* A new node that is already frozen has entered the cluster.
1655 The existing nodes are not frozen and don't need to be frozen
1656 until the election has ended and we start the actual recovery
1658 if (update_recmaster == true) {
1659 /* first we assume we will win the election and set
1660 recoverymaster to be ourself on the current node
1662 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1664 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1674 this function will unban all nodes in the cluster
1676 static void unban_all_nodes(struct ctdb_context *ctdb)
1679 struct ctdb_node_map *nodemap;
1680 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1682 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1684 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
/* clear the BANNED flag on every connected node that carries it */
1688 for (i=0;i<nodemap->num;i++) {
1689 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1690 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1691 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1695 talloc_free(tmp_ctx);
1700 we think we are winning the election - send a broadcast election request
1702 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1704 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
/* timed-event callback: rebroadcast our election request; false means we
   do not (re)set the recmaster here */
1707 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1709 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
/* this one-shot event has fired; drop our reference so a new one can be scheduled */
1712 talloc_free(rec->send_election_te);
1713 rec->send_election_te = NULL;
1717 handler for memory dumps
1719 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1720 TDB_DATA data, void *private_data)
1722 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1725 struct rd_memdump_reply *rd;
/* the message payload must be exactly a reply-address structure */
1727 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1728 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1729 talloc_free(tmp_ctx);
1732 rd = (struct rd_memdump_reply *)data.dptr;
1734 dump = talloc_zero(tmp_ctx, TDB_DATA);
1736 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1737 talloc_free(tmp_ctx);
/* collect the talloc memory report into *dump */
1740 ret = ctdb_dump_memory(ctdb, dump);
1742 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1743 talloc_free(tmp_ctx);
1747 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* send the dump back to the requester at the pnn/srvid it supplied */
1749 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1751 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1752 talloc_free(tmp_ctx);
1756 talloc_free(tmp_ctx);
1760 handler for reload_nodes
1762 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1763 TDB_DATA data, void *private_data)
1765 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1767 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
/* re-read the nodes file; message payload is unused */
1769 reload_nodes_file(rec->ctdb);
/* timed-event callback that re-enables the public-ip verification after the
   disable period requested via disable_ip_check_handler() has expired */
1773 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1774 struct timeval yt, void *p)
1776 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
/* freeing the context also frees this timed event, which hangs off it */
1778 talloc_free(rec->ip_check_disable_ctx);
1779 rec->ip_check_disable_ctx = NULL;
/* message handler: disable the public-ip allocation check for a number of
   seconds given as a uint32_t payload; a timed event re-enables it */
1782 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1783 TDB_DATA data, void *private_data)
1785 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
/* if a disable period is already running, cancel it (and its timer) first */
1788 if (rec->ip_check_disable_ctx != NULL) {
1789 talloc_free(rec->ip_check_disable_ctx);
1790 rec->ip_check_disable_ctx = NULL;
1793 if (data.dsize != sizeof(uint32_t)) {
/* NOTE(review): "expexting" and "recaived" below are typos in runtime log
   strings; left untouched here since log text is program output */
1794 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1795 "expexting %lu\n", (long unsigned)data.dsize,
1796 (long unsigned)sizeof(uint32_t)));
1799 if (data.dptr == NULL) {
1800 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1804 timeout = *((uint32_t *)data.dptr);
1805 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
/* the re-enable timer is parented on this context so freeing the context
   cancels the timer too */
1807 rec->ip_check_disable_ctx = talloc_new(rec);
1808 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
1810 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1815 handler for ip reallocate, just add it to the list of callers and
1816 handle this later in the monitor_cluster loop so we do not recurse
1817 with other callers to takeover_run()
1819 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1820 TDB_DATA data, void *private_data)
1822 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1823 struct ip_reallocate_list *caller;
/* payload must be a reply address (same shape as the memdump reply struct) */
1825 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1826 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
/* lazily create the context that owns all queued callers; it is freed as
   a unit by process_ipreallocate_requests() */
1830 if (rec->ip_reallocate_ctx == NULL) {
1831 rec->ip_reallocate_ctx = talloc_new(rec);
1832 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
1835 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1836 CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* take ownership of the payload and push this caller on the head of the list */
1838 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1839 caller->next = rec->reallocate_callers;
1840 rec->reallocate_callers = caller;
/* run one takeover pass and answer every caller queued by
   ip_reallocate_handler() with the (int32_t) result code */
1845 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1847 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1850 struct ip_reallocate_list *callers;
1852 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1853 ret = ctdb_takeover_run(ctdb, rec->nodemap);
/* all callers receive the same takeover_run result */
1854 result.dsize = sizeof(int32_t);
1855 result.dptr = (uint8_t *)&ret;
1857 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1858 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
1859 "%u:%lu\n", (unsigned)callers->rd->pnn,
1860 (long unsigned)callers->rd->srvid));
1861 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1863 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
1864 "message to %u:%lu\n",
1865 (unsigned)callers->rd->pnn,
1866 (long unsigned)callers->rd->srvid));
1870 talloc_free(tmp_ctx);
/* freeing ip_reallocate_ctx releases every queued caller in one go */
1871 talloc_free(rec->ip_reallocate_ctx);
1872 rec->ip_reallocate_ctx = NULL;
1873 rec->reallocate_callers = NULL;
1879 handler for recovery master elections
1881 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1882 TDB_DATA data, void *private_data)
1884 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1886 struct election_message *em = (struct election_message *)data.dptr;
1887 TALLOC_CTX *mem_ctx;
1889 /* we got an election packet - update the timeout for the election */
1890 talloc_free(rec->election_timeout);
1891 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1892 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1893 ctdb_election_timeout, rec);
1895 mem_ctx = talloc_new(ctdb);
1897 /* someone called an election. check their election data
1898 and if we disagree and we would rather be the elected node,
1899 send a new election message to all other nodes
1901 if (ctdb_election_win(rec, em)) {
/* we beat the sender: schedule (once) a delayed rebroadcast of our own
   election request rather than answering immediately */
1902 if (!rec->send_election_te) {
1903 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1904 timeval_current_ofs(0, 500000),
1905 election_send_request, rec);
1907 talloc_free(mem_ctx);
1908 /*unban_all_nodes(ctdb);*/
/* we lost: cancel any pending rebroadcast of our own candidacy */
1913 talloc_free(rec->send_election_te);
1914 rec->send_election_te = NULL;
1916 if (ctdb->tunable.verify_recovery_lock != 0) {
1917 /* release the recmaster lock */
1918 if (em->pnn != ctdb->pnn &&
1919 ctdb->recovery_lock_fd != -1) {
1920 close(ctdb->recovery_lock_fd);
1921 ctdb->recovery_lock_fd = -1;
1922 unban_all_nodes(ctdb);
1926 /* ok, let that guy become recmaster then */
1927 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1929 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1930 talloc_free(mem_ctx);
1934 talloc_free(mem_ctx);
1940 force the start of the election process
1942 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1943 struct ctdb_node_map *nodemap)
1946 struct ctdb_context *ctdb = rec->ctdb;
1948 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1950 /* set all nodes to recovery mode to stop all internode traffic */
1951 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1953 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (re)arm the election timeout before sending our request */
1957 talloc_free(rec->election_timeout);
1958 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1959 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1960 ctdb_election_timeout, rec);
/* true: also optimistically set ourselves as recmaster on this node */
1962 ret = send_election_request(rec, pnn, true);
1964 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1968 /* wait for a few seconds to collect all responses */
1969 ctdb_wait_election(rec);
1975 handler for when a node changes its flags
1977 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1978 TDB_DATA data, void *private_data)
1981 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1982 struct ctdb_node_map *nodemap=NULL;
1983 TALLOC_CTX *tmp_ctx;
1984 uint32_t changed_flags;
1986 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1987 int disabled_flag_changed;
/* payload must be a complete flag-change record */
1989 if (data.dsize != sizeof(*c)) {
1990 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1994 tmp_ctx = talloc_new(ctdb);
1995 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1997 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1999 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2000 talloc_free(tmp_ctx);
/* locate the node the change refers to in our local nodemap */
2005 for (i=0;i<nodemap->num;i++) {
2006 if (nodemap->nodes[i].pnn == c->pnn) break;
2009 if (i == nodemap->num) {
2010 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2011 talloc_free(tmp_ctx);
2015 changed_flags = c->old_flags ^ c->new_flags;
2017 if (nodemap->nodes[i].flags != c->new_flags) {
2018 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* did the DISABLED bit change relative to what we had recorded? */
2021 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2023 nodemap->nodes[i].flags = c->new_flags;
/* refresh our view of who the recmaster is and the current recovery mode */
2025 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2026 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2029 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2030 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
/* only the recmaster, and only outside recovery, reacts with a takeover run */
2034 ctdb->recovery_master == ctdb->pnn &&
2035 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2036 /* Only do the takeover run if the perm disabled or unhealthy
2037 flags changed since these will cause an ip failover but not
2039 If the node became disconnected or banned this will also
2040 lead to an ip address failover but that is handled
2043 if (disabled_flag_changed) {
2044 rec->need_takeover_run = true;
2048 talloc_free(tmp_ctx);
2052 handler for when we need to push out flag changes to all other nodes
2054 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2055 TDB_DATA data, void *private_data)
2058 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
/* set the node's flags to exactly c->new_flags: set those bits, clear the rest */
2060 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
2062 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* shared state for the async recmode verification: the callback decrements
   an outstanding-replies counter and records the worst status seen */
2067 struct verify_recmode_normal_data {
2069 enum monitor_result status; /* MONITOR_OK until a reply proves otherwise */
/* async completion callback for one getrecmode control sent by verify_recmode() */
2072 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2074 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2077 /* one more node has responded with recmode data*/
2080 /* if we failed to get the recmode, then return an error and let
2081 the main loop try again.
2083 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade from OK; never overwrite a worse status already recorded */
2084 if (rmdata->status == MONITOR_OK) {
2085 rmdata->status = MONITOR_FAILED;
2090 /* if we got a response, then the recmode will be stored in the
/* any node still in active recovery mode forces a new recovery */
2093 if (state->status != CTDB_RECOVERY_NORMAL) {
2094 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2095 rmdata->status = MONITOR_RECOVERY_NEEDED;
2102 /* verify that all nodes are in normal recovery mode */
2103 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2105 struct verify_recmode_normal_data *rmdata;
2106 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2107 struct ctdb_client_control_state *state;
2108 enum monitor_result status;
2111 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2112 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2114 rmdata->status = MONITOR_OK;
2116 /* loop over all active nodes and send an async getrecmode call to
/* skip inactive (banned/stopped/disconnected) nodes */
2118 for (j=0; j<nodemap->num; j++) {
2119 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2122 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2124 nodemap->nodes[j].pnn);
2125 if (state == NULL) {
2126 /* we failed to send the control, treat this as
2127 an error and try again next iteration
2129 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2130 talloc_free(mem_ctx);
2131 return MONITOR_FAILED;
2134 /* set up the callback functions */
2135 state->async.fn = verify_recmode_normal_callback;
2136 state->async.private_data = rmdata;
2138 /* one more control to wait for to complete */
2143 /* now wait for up to the maximum number of seconds allowed
2144 or until all nodes we expect a response from has replied
/* pump the event loop until every outstanding reply has been processed */
2146 while (rmdata->count > 0) {
2147 event_loop_once(ctdb->ev);
/* capture status before freeing mem_ctx, which owns rmdata */
2150 status = rmdata->status;
2151 talloc_free(mem_ctx);
/* shared state for the async recmaster verification; like
   verify_recmode_normal_data but also carries rec so disagreeing nodes
   can be marked as culprits */
2156 struct verify_recmaster_data {
2157 struct ctdb_recoverd *rec;
2160 enum monitor_result status; /* MONITOR_OK until a reply proves otherwise */
/* async completion callback for one getrecmaster control sent by verify_recmaster() */
2163 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2165 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2168 /* one more node has responded with recmaster data*/
2171 /* if we failed to get the recmaster, then return an error and let
2172 the main loop try again.
2174 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade from OK; keep any worse status already recorded */
2175 if (rmdata->status == MONITOR_OK) {
2176 rmdata->status = MONITOR_FAILED;
2181 /* if we got a response, then the recmaster will be stored in the
/* a node naming a different recmaster means the cluster disagrees:
   blame that node and request a new election */
2184 if (state->status != rmdata->pnn) {
2185 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2186 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2187 rmdata->status = MONITOR_ELECTION_NEEDED;
2194 /* verify that all nodes agree that we are the recmaster */
2195 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2197 struct ctdb_context *ctdb = rec->ctdb;
2198 struct verify_recmaster_data *rmdata;
2199 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2200 struct ctdb_client_control_state *state;
2201 enum monitor_result status;
2204 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2205 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2209 rmdata->status = MONITOR_OK;
2211 /* loop over all active nodes and send an async getrecmaster call to
/* skip inactive (banned/stopped/disconnected) nodes */
2213 for (j=0; j<nodemap->num; j++) {
2214 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2217 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2219 nodemap->nodes[j].pnn);
2220 if (state == NULL) {
2221 /* we failed to send the control, treat this as
2222 an error and try again next iteration
2224 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2225 talloc_free(mem_ctx);
2226 return MONITOR_FAILED;
2229 /* set up the callback functions */
2230 state->async.fn = verify_recmaster_callback;
2231 state->async.private_data = rmdata;
2233 /* one more control to wait for to complete */
2238 /* now wait for up to the maximum number of seconds allowed
2239 or until all nodes we expect a response from has replied
/* pump the event loop until all outstanding replies are in */
2241 while (rmdata->count > 0) {
2242 event_loop_once(ctdb->ev);
/* capture status before freeing mem_ctx, which owns rmdata */
2245 status = rmdata->status;
2246 talloc_free(mem_ctx);
2251 /* called to check that the allocation of public ip addresses is ok.
2253 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2255 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2256 struct ctdb_all_public_ips *ips = NULL;
2257 struct ctdb_uptime *uptime1 = NULL;
2258 struct ctdb_uptime *uptime2 = NULL;
/* sample uptime, read the ip list, then sample uptime again: if a recovery
   started or finished while we read the list, the list may be stale and
   the check is skipped */
2261 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2262 CTDB_CURRENT_NODE, &uptime1);
2264 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2265 talloc_free(mem_ctx);
2269 /* read the ip allocation from the local node */
2270 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2272 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2273 talloc_free(mem_ctx);
2277 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2278 CTDB_CURRENT_NODE, &uptime2);
2280 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2281 talloc_free(mem_ctx);
2285 /* skip the check if the startrecovery time has changed */
2286 if (timeval_compare(&uptime1->last_recovery_started,
2287 &uptime2->last_recovery_started) != 0) {
2288 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2289 talloc_free(mem_ctx);
2293 /* skip the check if the endrecovery time has changed */
2294 if (timeval_compare(&uptime1->last_recovery_finished,
2295 &uptime2->last_recovery_finished) != 0) {
2296 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2297 talloc_free(mem_ctx);
2301 /* skip the check if we have started but not finished recovery */
2302 if (timeval_compare(&uptime1->last_recovery_finished,
2303 &uptime1->last_recovery_started) != 1) {
2304 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2305 talloc_free(mem_ctx);
2310 /* verify that we have the ip addresses we should have
2311 and we dont have ones we shouldnt have.
2312 if we find an inconsistency we set recmode to
2313 active on the local node and wait for the recmaster
2314 to do a full blown recovery
2316 for (j=0; j<ips->num; j++) {
/* case 1: this ip is assigned to us but we are not serving it */
2317 if (ips->ips[j].pnn == pnn) {
2318 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2319 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2320 ctdb_addr_to_str(&ips->ips[j].addr)));
2321 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2323 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2325 talloc_free(mem_ctx);
2328 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2330 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2332 talloc_free(mem_ctx);
/* case 2: this ip is assigned elsewhere but we are still serving it */
2337 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2338 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2339 ctdb_addr_to_str(&ips->ips[j].addr)));
2341 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2343 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2345 talloc_free(mem_ctx);
2348 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2350 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2352 talloc_free(mem_ctx);
2359 talloc_free(mem_ctx);
/* async callback for get_remote_nodemaps(): stores node_pnn's nodemap reply
   into the remote_nodemaps[] array passed as callback_data */
2364 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2366 struct ctdb_node_map **remote_nodemaps = callback_data;
/* bounds-check the pnn before using it as an array index */
2368 if (node_pnn >= ctdb->num_nodes) {
2369 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
/* take ownership of the reply buffer; it now lives as long as the array */
2373 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* fetch the nodemap from every active node (self included) in parallel;
   results land in remote_nodemaps[] via async_getnodemap_callback() */
2377 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2378 struct ctdb_node_map *nodemap,
2379 struct ctdb_node_map **remote_nodemaps)
2383 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2384 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2386 CONTROL_TIMEOUT(), false, tdb_null,
2387 async_getnodemap_callback,
2389 remote_nodemaps) != 0) {
2390 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* outcome of the child process that probes the recovery lock file */
2398 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
/* state for one check_recovery_lock() run: child pid, result pipe, the
   timeout and fd events, and the status written back by the callbacks */
2399 struct ctdb_check_reclock_state {
2400 struct ctdb_context *ctdb;
2401 struct timeval start_time; /* used to report reclock check latency */
2404 struct timed_event *te; /* fires if the child hangs on the reclock file */
2405 struct fd_event *fde; /* fires when the child writes its result */
2406 enum reclock_child_status status;
2409 /* when we free the reclock state we must kill any child process.
2411 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2413 struct ctdb_context *ctdb = state->ctdb;
/* report how long the reclock check took before tearing everything down */
2415 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
/* close both pipe ends if still open */
2417 if (state->fd[0] != -1) {
2418 close(state->fd[0]);
2421 if (state->fd[1] != -1) {
2422 close(state->fd[1]);
/* make sure the checker child does not outlive this state */
2425 kill(state->child, SIGKILL);
2430 called if our check_reclock child times out. this would happen if
2431 i/o to the reclock file blocks.
2433 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2434 struct timeval t, void *private_data)
2436 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2437 struct ctdb_check_reclock_state);
2439 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
/* the event loop in check_recovery_lock() exits once status leaves CHECKING */
2440 state->status = RECLOCK_TIMEOUT;
2443 /* this is called when the child process has completed checking the reclock
2444 file and has written data back to us through the pipe.
2446 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2447 uint16_t flags, void *private_data)
2449 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2450 struct ctdb_check_reclock_state);
2454 /* we got a response from our child process so we can abort the
/* cancel the hang-timeout now that the child has answered */
2457 talloc_free(state->te);
/* a single status byte is expected; anything else counts as failure */
2460 ret = read(state->fd[0], &c, 1);
2461 if (ret != 1 || c != RECLOCK_OK) {
2462 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2463 state->status = RECLOCK_FAILED;
2468 state->status = RECLOCK_OK;
/* verify we still hold the recovery lock by forking a child that does a
   pread() on the lock fd and reports a status byte back over a pipe; a
   15s timer catches the case where the cluster filesystem hangs */
2472 static int check_recovery_lock(struct ctdb_context *ctdb)
2475 struct ctdb_check_reclock_state *state;
2476 pid_t parent = getpid();
2478 if (ctdb->recovery_lock_fd == -1) {
2479 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2483 state = talloc(ctdb, struct ctdb_check_reclock_state);
2484 CTDB_NO_MEMORY(ctdb, state);
2487 state->start_time = timeval_current();
2488 state->status = RECLOCK_CHECKING;
2492 ret = pipe(state->fd);
2495 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2499 state->child = fork();
2500 if (state->child == (pid_t)-1) {
2501 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2502 close(state->fd[0]);
2504 close(state->fd[1]);
/* child: probe the lock file and write one status byte to the parent */
2510 if (state->child == 0) {
2511 char cc = RECLOCK_OK;
2512 close(state->fd[0]);
/* reading the lock fd proves the CFS still grants us access to the file */
2515 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2516 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2517 cc = RECLOCK_FAILED;
2520 write(state->fd[1], &cc, 1);
2521 /* make sure we die when our parent dies */
2522 while (kill(parent, 0) == 0 || errno != ESRCH) {
2524 write(state->fd[1], &cc, 1);
/* parent: keep only the read end; the destructor cleans up pipe + child */
2528 close(state->fd[1]);
2530 set_close_on_exec(state->fd[0]);
2532 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
2534 talloc_set_destructor(state, check_reclock_destructor);
2536 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2537 ctdb_check_reclock_timeout, state);
2538 if (state->te == NULL) {
2539 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2544 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2545 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2546 reclock_child_handler,
2549 if (state->fde == NULL) {
2550 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* block here until the child replies or the 15s timer fires */
2555 while (state->status == RECLOCK_CHECKING) {
2556 event_loop_once(ctdb->ev);
/* on failure, drop the lock fd so a fresh lock attempt happens later */
2559 if (state->status == RECLOCK_FAILED) {
2560 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2561 close(ctdb->recovery_lock_fd);
2562 ctdb->recovery_lock_fd = -1;
/* Fetch the current reclock file name from the main daemon and reconcile
 * our local copy (ctdb->recovery_lock_file / recovery_lock_fd) with it.
 * Handles three cases: reclock disabled, first time set, and file changed. */
2571 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2573 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2574 	const char *reclockfile;
2576 	if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2577 		DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2578 		talloc_free(tmp_ctx);
	/* reclock has been disabled on the daemon: forget our cached name,
	 * close the fd and stop verifying the lock */
2582 	if (reclockfile == NULL) {
2583 		if (ctdb->recovery_lock_file != NULL) {
2584 			DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2585 			talloc_free(ctdb->recovery_lock_file);
2586 			ctdb->recovery_lock_file = NULL;
2587 			if (ctdb->recovery_lock_fd != -1) {
2588 				close(ctdb->recovery_lock_fd);
2589 				ctdb->recovery_lock_fd = -1;
2592 		ctdb->tunable.verify_recovery_lock = 0;
2593 		talloc_free(tmp_ctx);
	/* first time we learn of a reclock file: cache the name */
2597 	if (ctdb->recovery_lock_file == NULL) {
2598 		ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2599 		if (ctdb->recovery_lock_fd != -1) {
2600 			close(ctdb->recovery_lock_fd);
2601 			ctdb->recovery_lock_fd = -1;
2603 		talloc_free(tmp_ctx);
	/* unchanged name: nothing to do */
2608 	if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2609 		talloc_free(tmp_ctx);
	/* the reclock file has changed: switch to the new name and drop the
	 * old fd; verification is re-enabled elsewhere once the new lock is
	 * taken (presumably — TODO confirm against the rest of the file) */
2613 	talloc_free(ctdb->recovery_lock_file);
2614 	ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2615 	ctdb->tunable.verify_recovery_lock = 0;
2616 	if (ctdb->recovery_lock_fd != -1) {
2617 		close(ctdb->recovery_lock_fd);
2618 		ctdb->recovery_lock_fd = -1;
2621 	talloc_free(tmp_ctx);
2626   the main monitoring loop
2628 static void monitor_cluster(struct ctdb_context *ctdb)
2631 	TALLOC_CTX *mem_ctx=NULL;
2632 	struct ctdb_node_map *nodemap=NULL;
2633 	struct ctdb_node_map *recmaster_nodemap=NULL;
2634 	struct ctdb_node_map **remote_nodemaps=NULL;
2635 	struct ctdb_vnn_map *vnnmap=NULL;
2636 	struct ctdb_vnn_map *remote_vnnmap=NULL;
2637 	int32_t debug_level;
2639 	struct ctdb_recoverd *rec;
2641 	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2643 	rec = talloc_zero(ctdb, struct ctdb_recoverd);
2644 	CTDB_NO_MEMORY_FATAL(ctdb, rec);
2648 	rec->priority_time = timeval_current();
2650 	/* register a message port for sending memory dumps */
2651 	ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2653 	/* register a message port for recovery elections */
2654 	ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2656 	/* when nodes are disabled/enabled */
2657 	ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2659 	/* when we are asked to push out a flag change */
2660 	ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2662 	/* register a message port for vacuum fetch */
2663 	ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2665 	/* register a message port for reloadnodes  */
2666 	ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2668 	/* register a message port for performing a takeover run */
2669 	ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2671 	/* register a message port for disabling the ip check for a short while */
2672 	ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
	/* top of the monitoring loop: throw away all per-iteration
	 * allocations and start fresh */
2676 	talloc_free(mem_ctx);
2679 	mem_ctx = talloc_new(ctdb);
2681 		DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2685 	/* we only check for recovery once every second */
2686 	ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2688 	/* verify that the main daemon is still running */
2689 	if (kill(ctdb->ctdbd_pid, 0) != 0) {
2690 		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2694 	/* ping the local daemon to tell it we are alive */
2695 	ctdb_ctrl_recd_ping(ctdb);
2697 	if (rec->election_timeout) {
2698 		/* an election is in progress */
2702 	/* read the debug level from the parent and update locally */
2703 	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2705 		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2708 		LogLevel = debug_level;
2711 	/* We must check if we need to ban a node here but we want to do this
2712 	   as early as possible so we dont wait until we have pulled the node
2713 	   map from the local node. that's why we have the hardcoded value 20
2715 	for (i=0; i<ctdb->num_nodes; i++) {
2716 		struct ctdb_banning_state *ban_state;
2718 		if (ctdb->nodes[i]->ban_state == NULL) {
2721 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
2722 		if (ban_state->count < 20) {
2725 		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2726 			ctdb->nodes[i]->pnn, ban_state->count,
2727 			ctdb->tunable.recovery_ban_period));
2728 		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2729 		ban_state->count = 0;
2732 	/* get relevant tunables */
2733 	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2735 		DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2739 	/* get the current recovery lock file from the server */
2740 	if (update_recovery_lock_file(ctdb) != 0) {
2741 		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2745 	/* Make sure that if recovery lock verification becomes disabled when
	   we close the file (so a stale fd is never kept around) */
2748 	if (ctdb->tunable.verify_recovery_lock == 0) {
2749 		if (ctdb->recovery_lock_fd != -1) {
2750 			close(ctdb->recovery_lock_fd);
2751 			ctdb->recovery_lock_fd = -1;
2755 	pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2756 	if (pnn == (uint32_t)-1) {
2757 		DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2761 	/* get the vnnmap */
2762 	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2764 		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2769 	/* get number of nodes */
2771 		talloc_free(rec->nodemap);
2772 		rec->nodemap = NULL;
2775 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2777 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2780 	nodemap = rec->nodemap;
2782 	/* check which node is the recovery master */
2783 	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2785 		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2789 	/* if we are not the recmaster we can safely ignore any ip reallocate requests */
2790 	if (rec->recmaster != pnn) {
2791 		if (rec->ip_reallocate_ctx != NULL) {
2792 			talloc_free(rec->ip_reallocate_ctx);
2793 			rec->ip_reallocate_ctx = NULL;
2794 			rec->reallocate_callers = NULL;
2797 		/* if there are takeovers requested, perform it and notify the waiters */
2798 		if (rec->reallocate_callers) {
2799 			process_ipreallocate_requests(ctdb, rec);
	/* no recmaster known yet: trigger the first election */
2802 	if (rec->recmaster == (uint32_t)-1) {
2803 		DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2804 		force_election(rec, pnn, nodemap);
2809 	/* if the local daemon is STOPPED, we verify that the databases are
2810 	   also frozen and that the recmode is set to active
2812 	if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2813 		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2815 			DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2817 		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2818 			DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2820 			ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2822 				DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2825 			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2827 				DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2834 	/* If the local node is stopped, verify we are not the recmaster
2835 	   and yield this role if so
2837 	if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2838 		DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2839 		force_election(rec, pnn, nodemap);
2843 	/* check that we (recovery daemon) and the local ctdb daemon
2844 	   agrees on whether we are banned or not
2848 	/* remember our own node flags */
2849 	rec->node_flags = nodemap->nodes[pnn].flags;
2851 	/* count how many active nodes there are */
2852 	rec->num_active    = 0;
2853 	rec->num_connected = 0;
2854 	for (i=0; i<nodemap->num; i++) {
2855 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2858 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2859 			rec->num_connected++;
2864 	/* verify that the recmaster node is still active */
2865 	for (j=0; j<nodemap->num; j++) {
2866 		if (nodemap->nodes[j].pnn==rec->recmaster) {
	/* recmaster's pnn not found in our nodemap at all */
2871 	if (j == nodemap->num) {
2872 		DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2873 		force_election(rec, pnn, nodemap);
2877 	/* if recovery master is disconnected we must elect a new recmaster */
2878 	if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2879 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2880 		force_election(rec, pnn, nodemap);
2884 	/* grab the nodemap from the recovery master to check if it is banned */
2885 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2886 				   mem_ctx, &recmaster_nodemap);
2888 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2889 			  nodemap->nodes[j].pnn));
	/* NOTE(review): j is an index into OUR nodemap; indexing the
	 * recmaster's nodemap with it assumes both maps list nodes in the
	 * same order — TODO confirm this invariant holds cluster-wide */
2894 	if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2895 		DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2896 		force_election(rec, pnn, nodemap);
2901 	/* verify that we have all ip addresses we should have and we dont
2902 	 * have addresses we shouldnt have.
2904 	if (ctdb->do_checkpublicip) {
2905 		if (rec->ip_check_disable_ctx == NULL) {
2906 			if (verify_ip_allocation(ctdb, pnn) != 0) {
2907 				DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2914 	/* if we are not the recmaster then we do not need to check
2915 	   if recovery is needed
2917 	if (pnn != rec->recmaster) {
	/* ---- everything below this point runs on the recmaster only ---- */
2922 	/* ensure our local copies of flags are right */
2923 	ret = update_local_flags(rec, nodemap);
2924 	if (ret == MONITOR_ELECTION_NEEDED) {
2925 		DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2926 		force_election(rec, pnn, nodemap);
2929 	if (ret != MONITOR_OK) {
2930 		DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2934 	/* update the list of public ips that a node can handle for
	   all connected nodes
2937 	if (ctdb->num_nodes != nodemap->num) {
2938 		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2939 		reload_nodes_file(ctdb);
2942 	for (j=0; j<nodemap->num; j++) {
2943 		/* release any existing data */
2944 		if (ctdb->nodes[j]->public_ips) {
2945 			talloc_free(ctdb->nodes[j]->public_ips);
2946 			ctdb->nodes[j]->public_ips = NULL;
2949 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2953 		/* grab a new shiny list of public ips from the node */
2954 		if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2955 					ctdb->nodes[j]->pnn,
2957 					&ctdb->nodes[j]->public_ips)) {
2958 			DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2959 				ctdb->nodes[j]->pnn));
2965 	/* verify that all active nodes agree that we are the recmaster */
2966 	switch (verify_recmaster(rec, nodemap, pnn)) {
2967 	case MONITOR_RECOVERY_NEEDED:
2968 		/* can not happen */
2970 	case MONITOR_ELECTION_NEEDED:
2971 		force_election(rec, pnn, nodemap);
2975 	case MONITOR_FAILED:
2980 	if (rec->need_recovery) {
2981 		/* a previous recovery didn't finish */
2982 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2986 	/* verify that all active nodes are in normal mode
2987 	   and not in recovery mode
2989 	switch (verify_recmode(ctdb, nodemap)) {
2990 	case MONITOR_RECOVERY_NEEDED:
2991 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2993 	case MONITOR_FAILED:
2995 	case MONITOR_ELECTION_NEEDED:
2996 		/* can not happen */
3002 	if (ctdb->tunable.verify_recovery_lock != 0) {
3003 		/* we should have the reclock - check its not stale */
3004 		ret = check_recovery_lock(ctdb);
3006 			DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3007 			ctdb_set_culprit(rec, ctdb->pnn);
3008 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3013 	/* get the nodemap for all active remote nodes
3015 	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3016 	if (remote_nodemaps == NULL) {
3017 		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3020 	for(i=0; i<nodemap->num; i++) {
3021 		remote_nodemaps[i] = NULL;
3023 	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3024 		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3028 	/* verify that all other nodes have the same nodemap as we have
3030 	for (j=0; j<nodemap->num; j++) {
3031 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3035 		if (remote_nodemaps[j] == NULL) {
3036 			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3037 			ctdb_set_culprit(rec, j);
3042 		/* if the nodes disagree on how many nodes there are
3043 		   then this is a good reason to try recovery
3045 		if (remote_nodemaps[j]->num != nodemap->num) {
3046 			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3047 				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3048 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3049 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3053 		/* if the nodes disagree on which nodes exist and are
3054 		   active, then that is also a good reason to do recovery
3056 		for (i=0;i<nodemap->num;i++) {
3057 			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3058 				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3059 					  nodemap->nodes[j].pnn, i,
3060 					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3061 				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3062 				do_recovery(rec, mem_ctx, pnn, nodemap,
3068 		/* verify the flags are consistent
3070 		for (i=0; i<nodemap->num; i++) {
3071 			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3075 			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
				/* NOTE(review): the "our" value printed below is
				 * nodes[j].flags but the comparison above uses
				 * nodes[i].flags — this debug output likely
				 * reports the wrong node's flags; verify against
				 * upstream */
3076 				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3077 				  nodemap->nodes[j].pnn,
3078 				  nodemap->nodes[i].pnn,
3079 				  remote_nodemaps[j]->nodes[i].flags,
3080 				  nodemap->nodes[j].flags));
3082 					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3083 					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3084 					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3085 					do_recovery(rec, mem_ctx, pnn, nodemap,
3089 					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3090 					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3091 					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3092 					do_recovery(rec, mem_ctx, pnn, nodemap,
3101 	/* there better be the same number of lmasters in the vnn map
3102 	   as there are active nodes or we will have to do a recovery
3104 	if (vnnmap->size != rec->num_active) {
3105 		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3106 			  vnnmap->size, rec->num_active));
3107 		ctdb_set_culprit(rec, ctdb->pnn);
3108 		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3112 	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap
3115 	for (j=0; j<nodemap->num; j++) {
3116 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3119 		if (nodemap->nodes[j].pnn == pnn) {
3123 		for (i=0; i<vnnmap->size; i++) {
3124 			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3128 		if (i == vnnmap->size) {
3129 			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3130 				  nodemap->nodes[j].pnn));
3131 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3132 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3138 	/* verify that all other nodes have the same vnnmap
3139 	   and are from the same generation
3141 	for (j=0; j<nodemap->num; j++) {
3142 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3145 		if (nodemap->nodes[j].pnn == pnn) {
3149 		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3150 					  mem_ctx, &remote_vnnmap);
3152 			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3153 				  nodemap->nodes[j].pnn));
3157 		/* verify the vnnmap generation is the same */
3158 		if (vnnmap->generation != remote_vnnmap->generation) {
3159 			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3160 				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3161 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3162 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3166 		/* verify the vnnmap size is the same */
3167 		if (vnnmap->size != remote_vnnmap->size) {
3168 			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3169 				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3170 			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3171 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3175 		/* verify the vnnmap is the same */
3176 		for (i=0;i<vnnmap->size;i++) {
3177 			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3178 				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3179 					  nodemap->nodes[j].pnn));
3180 				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3181 				do_recovery(rec, mem_ctx, pnn, nodemap,
3188 	/* we might need to change who has what IP assigned */
3189 	if (rec->need_takeover_run) {
3190 		rec->need_takeover_run = false;
3192 		/* execute the "startrecovery" event script on all nodes */
3193 		ret = run_startrecovery_eventscript(rec, nodemap);
3195 			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3196 			ctdb_set_culprit(rec, ctdb->pnn);
3197 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3200 		ret = ctdb_takeover_run(ctdb, nodemap);
3202 			DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3203 			ctdb_set_culprit(rec, ctdb->pnn);
3204 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3207 		/* execute the "recovered" event script on all nodes */
3208 		ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3210 		// we cant check whether the event completed successfully
3211 		// since this script WILL fail if the node is in recovery mode
3212 		// and if that race happens, the code here would just cause a second
3213 		// cascading recovery.
3215 			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3216 			ctdb_set_culprit(rec, ctdb->pnn);
3217 			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3228   event handler for when the main ctdbd dies
3230 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3231 				 uint16_t flags, void *private_data)
	/* fired when the pipe to the parent daemon becomes readable/EOF,
	 * i.e. the parent closed its end by dying */
3233 	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3238   called regularly to verify that the recovery daemon is still running
3240 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3241 			    struct timeval yt, void *p)
3243 	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
	/* kill(pid, 0) probes process existence without sending a signal */
3245 	if (kill(ctdb->recoverd_pid, 0) != 0) {
3246 		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
		/* orderly shutdown of the whole main daemon: stop subsystems,
		 * release all public IPs and run the shutdown event script */
3248 		ctdb_stop_recoverd(ctdb);
3249 		ctdb_stop_keepalive(ctdb);
3250 		ctdb_stop_monitoring(ctdb);
3251 		ctdb_release_all_ips(ctdb);
3252 		if (ctdb->methods != NULL) {
3253 			ctdb->methods->shutdown(ctdb);
3255 		ctdb_event_script(ctdb, "shutdown");
	/* re-arm: check again in 30 seconds */
3260 	event_add_timed(ctdb->ev, ctdb,
3261 			timeval_current_ofs(30, 0),
3262 			ctdb_check_recd, ctdb);
/* SIGCHLD handler for the recovery daemon: reap any exited child
 * processes (e.g. the reclock check child) so they do not become zombies. */
3265 static void recd_sig_child_handler(struct event_context *ev,
3266 	struct signal_event *se, int signum, int count,
3270 //	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	/* WNOHANG: never block; presumably called in a loop until no more
	 * children are pending — TODO confirm (loop body partly outside view) */
3275 		pid = waitpid(-1, &status, WNOHANG);
3277 			if (errno != ECHILD) {
3278 				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3283 			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3289   startup the recovery daemon as a child of the main ctdb daemon
3291 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3294 	struct signal_event *se;
	/* pipe used by the child to detect parent death (EOF on fd[0]) */
3296 	if (pipe(fd) != 0) {
3300 	ctdb->ctdbd_pid = getpid();
3302 	ctdb->recoverd_pid = fork();
3303 	if (ctdb->recoverd_pid == -1) {
	/* parent: arm the periodic liveness check of the recovery daemon */
3307 	if (ctdb->recoverd_pid != 0) {
3309 		event_add_timed(ctdb->ev, ctdb,
3310 				timeval_current_ofs(30, 0),
3311 				ctdb_check_recd, ctdb);
	/* child (the recovery daemon) from here on */
	/* per-process random seed; used e.g. for election jitter */
3317 	srandom(getpid() ^ time(NULL));
3319 	if (switch_from_server_to_client(ctdb) != 0) {
3320 		DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3324 	DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
	/* exit if the parent daemon dies (pipe read end fires on EOF) */
3326 	event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3327 		     ctdb_recoverd_parent, &fd[0]);
3329 	/* set up a handler to pick up sigchld */
3330 	se = event_add_signal(ctdb->ev, ctdb,
3332 		recd_sig_child_handler,
3335 		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
	/* never returns under normal operation */
3339 	monitor_cluster(ctdb);
3341 	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3346   shutdown the recovery daemon
3348 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
	/* nothing to do if the recovery daemon was never started */
3350 	if (ctdb->recoverd_pid == 0) {
3354 	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
	/* SIGTERM allows the recoverd to exit; reaping is handled elsewhere */
3355 	kill(ctdb->recoverd_pid, SIGTERM);