4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
/* NOTE(review): this extract elides many physical lines (braces, blank
   lines, some members); the leading numbers are original line numbers. */
/* One entry per "ctdb ipreallocate" client waiting for the current
   takeover run to finish; rd holds the reply address for the caller. */
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
39 struct rd_memdump_reply *rd;
/* Per-node record of recent misbehaviour used for banning decisions.
   last_reported_time is when credits were last added; a counter member
   exists in the full source but is elided from this extract. */
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;
/* Private, in-memory state of the recovery daemon (one instance per
   recoverd process).  Holds the cached node map, election timers, the
   vacuuming work list and the "ctdb ipreallocate" caller queue. */
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
/* pnn of the node most recently blamed for a recovery (see ctdb_set_culprit_count) */
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
/* control/monitor timeouts derived from tunables; both macros expect a
   local variable named "ctdb" to be in scope at the expansion site */
69 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
70 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Ban node "pnn" for "ban_time" seconds by sending a SET_BAN control to
   that node.  Validates the pnn first; failures are logged only (the
   caller is not informed - best effort). */
74 ban a node for a period of time
76 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
79 struct ctdb_context *ctdb = rec->ctdb;
80 struct ctdb_ban_time bantime;
82 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* NOTE(review): the line setting bantime.pnn is elided from this extract */
90 bantime.time = ban_time;
92 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
94 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* result codes produced by the periodic cluster monitoring pass */
100 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/* Broadcast CTDB_CONTROL_END_RECOVERY to all active nodes so each runs
   its "recovered" event scripts.  "caller" is only used to attribute
   failures in the log.  Returns 0 on success, -1 on failure. */
104 run the "recovered" eventscript on all nodes
106 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
111 tmp_ctx = talloc_new(ctdb);
112 CTDB_NO_MEMORY(ctdb, tmp_ctx);
114 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
115 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
117 CONTROL_TIMEOUT(), false, tdb_null,
120 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
122 talloc_free(tmp_ctx);
126 talloc_free(tmp_ctx);
131 remember the trouble maker
133 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
135 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
136 struct ctdb_banning_state *ban_state;
138 if (culprit > ctdb->num_nodes) {
139 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
143 if (ctdb->nodes[culprit]->ban_state == NULL) {
144 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
145 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
149 ban_state = ctdb->nodes[culprit]->ban_state;
150 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
151 /* this was the first time in a long while this node
152 misbehaved so we will forgive any old transgressions.
154 ban_state->count = 0;
157 ban_state->count += count;
158 ban_state->last_reported_time = timeval_current();
159 rec->last_culprit_node = culprit;
/* Convenience wrapper: blame "culprit" with a single credit. */
163 remember the trouble maker
165 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
167 ctdb_set_culprit_count(rec, culprit, 1);
/* Async per-node failure callback: a node failed its "startrecovery"
   event, so mark it as a recovery-failure culprit (one credit). */
171 /* this callback is called for every node that failed to execute the
174 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
176 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
178 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
180 ctdb_set_culprit(rec, node_pnn);
/* Broadcast CTDB_CONTROL_START_RECOVERY to all active nodes; nodes that
   fail are blamed via startrecovery_fail_callback.  Returns 0 on
   success, -1 if the broadcast failed. */
184 run the "startrecovery" eventscript on all nodes
186 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
190 struct ctdb_context *ctdb = rec->ctdb;
192 tmp_ctx = talloc_new(ctdb);
193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
195 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
198 CONTROL_TIMEOUT(), false, tdb_null,
200 startrecovery_fail_callback,
202 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
203 talloc_free(tmp_ctx);
207 talloc_free(tmp_ctx);
211 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
213 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
214 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
217 if (node_pnn < ctdb->num_nodes) {
218 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
/* Query CTDB_CONTROL_GET_CAPABILITIES from all active nodes; replies
   are cached per-node by async_getcap_callback.  Returns 0 on success,
   -1 if any node failed to answer. */
223 update the node capabilities for all connected nodes
225 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
230 tmp_ctx = talloc_new(ctdb);
231 CTDB_NO_MEMORY(ctdb, tmp_ctx);
233 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
234 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
238 async_getcap_callback, NULL,
240 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
241 talloc_free(tmp_ctx);
245 talloc_free(tmp_ctx);
/* Async per-node failure callback: a node failed to freeze during
   recovery; blame it heavily (nodemap->num credits) so repeat offenders
   get banned quickly. */
249 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
251 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
253 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
254 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Async per-node failure callback: a node failed to start the recovery
   transaction; blame it with nodemap->num credits. */
257 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
259 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
261 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
262 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Set the recovery mode on all active nodes.  When entering
   CTDB_RECOVERY_ACTIVE the databases are first frozen, one FREEZE
   control per database priority level; freeze failures blame the node
   via set_recmode_fail_callback.  Returns 0 on success, -1 on failure. */
266 change recovery mode on all nodes
268 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
274 tmp_ctx = talloc_new(ctdb);
275 CTDB_NO_MEMORY(ctdb, tmp_ctx);
277 /* freeze all nodes */
278 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
279 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
/* priority levels are 1-based, hence i=1..NUM_DB_PRIORITIES inclusive */
282 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
283 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
288 set_recmode_fail_callback,
290 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
291 talloc_free(tmp_ctx);
298 data.dsize = sizeof(uint32_t);
299 data.dptr = (unsigned char *)&rec_mode;
301 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
308 talloc_free(tmp_ctx);
312 talloc_free(tmp_ctx);
/* Tell all active nodes that "pnn" is the recovery master by sending a
   SET_RECMASTER control.  Returns 0 on success, -1 on failure. */
317 change recovery master on all node
319 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
325 tmp_ctx = talloc_new(ctdb);
326 CTDB_NO_MEMORY(ctdb, tmp_ctx);
328 data.dsize = sizeof(uint32_t);
329 data.dptr = (unsigned char *)&pnn;
331 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
334 CONTROL_TIMEOUT(), false, data,
337 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
338 talloc_free(tmp_ctx);
342 talloc_free(tmp_ctx);
/* Push our local per-database priority settings to all active remote
   nodes.  Deliberately best-effort: failures are logged but never fail
   a recovery, because older remote nodes may not support the control. */
346 /* update all remote nodes to use the same db priority that we have
347 this can fail if the remote node has not yet been upgraded to
348 support this function, so we always return success and never fail
349 a recovery if this call fails.
351 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
352 struct ctdb_node_map *nodemap,
353 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
358 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
360 /* step through all local databases */
361 for (db=0; db<dbmap->num;db++) {
363 struct ctdb_db_priority db_prio;
366 db_prio.db_id = dbmap->dbs[db].dbid;
/* read the priority from the local node, then broadcast it */
367 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
369 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
373 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
375 data.dptr = (uint8_t *)&db_prio;
376 data.dsize = sizeof(db_prio);
378 if (ctdb_client_async_control(ctdb,
379 CTDB_CONTROL_SET_DB_PRIORITY,
381 CONTROL_TIMEOUT(), false, data,
384 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
/* Make sure every other active node has attached to every database we
   have locally: fetch each remote node's dbmap, and create (attach) any
   of our databases it is missing.  Returns 0 on success, -1 on error. */
392 ensure all other nodes have attached to any databases that we have
394 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
395 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
398 struct ctdb_dbid_map *remote_dbmap;
400 /* verify that all other nodes have all our databases */
401 for (j=0; j<nodemap->num; j++) {
402 /* we don't need to check ourselves */
403 if (nodemap->nodes[j].pnn == pnn) {
406 /* dont check nodes that are unavailable */
407 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
411 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
412 mem_ctx, &remote_dbmap);
414 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
418 /* step through all local databases */
419 for (db=0; db<dbmap->num;db++) {
/* linear scan of the remote dbmap for this dbid */
423 for (i=0;i<remote_dbmap->num;i++) {
424 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
428 /* the remote node already have this database */
429 if (i!=remote_dbmap->num) {
432 /* ok so we need to create this database */
433 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
436 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
439 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
440 mem_ctx, name, dbmap->dbs[db].persistent);
442 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
/* Mirror of create_missing_remote_databases: attach locally to every
   database that any other active node has but we don't, then re-read
   our own dbmap into *dbmap so it reflects the new attachments. */
453 ensure we are attached to any databases that anyone else is attached to
455 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
456 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
459 struct ctdb_dbid_map *remote_dbmap;
461 /* verify that we have all database any other node has */
462 for (j=0; j<nodemap->num; j++) {
463 /* we don't need to check ourselves */
464 if (nodemap->nodes[j].pnn == pnn) {
467 /* dont check nodes that are unavailable */
468 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
472 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
473 mem_ctx, &remote_dbmap);
475 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
479 /* step through all databases on the remote node */
480 for (db=0; db<remote_dbmap->num;db++) {
483 for (i=0;i<(*dbmap)->num;i++) {
484 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
488 /* we already have this db locally */
489 if (i!=(*dbmap)->num) {
492 /* ok so we need to create this database and
495 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
496 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
498 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
499 nodemap->nodes[j].pnn));
502 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
503 remote_dbmap->dbs[db].persistent);
505 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our dbmap so the caller sees the newly-attached databases */
508 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
/* Pull the full contents of database "dbid" from node "srcnode" into
   the temporary recovery tdb, merging record-by-record: an incoming
   record replaces an existing one only if its rsn is newer (or equal
   with a dmaster other than the recovery master).  Returns 0/-1. */
521 pull the remote database contents from one node into the recdb
523 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
524 struct tdb_wrap *recdb, uint32_t dbid)
528 struct ctdb_marshall_buffer *reply;
529 struct ctdb_rec_data *rec;
531 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
533 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
534 CONTROL_TIMEOUT(), &outdata);
536 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
537 talloc_free(tmp_ctx);
541 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the marshalled reply before walking it */
543 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
544 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
545 talloc_free(tmp_ctx);
549 rec = (struct ctdb_rec_data *)&reply->data[0];
/* advance to the next marshalled record (records are variable length) */
553 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
555 struct ctdb_ltdb_header *hdr;
558 key.dptr = &rec->data[0];
559 key.dsize = rec->keylen;
560 data.dptr = &rec->data[key.dsize];
561 data.dsize = rec->datalen;
563 hdr = (struct ctdb_ltdb_header *)data.dptr;
565 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
566 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
567 talloc_free(tmp_ctx);
571 /* fetch the existing record, if any */
572 existing = tdb_fetch(recdb->tdb, key);
574 if (existing.dptr != NULL) {
575 struct ctdb_ltdb_header header;
576 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
577 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
578 (unsigned)existing.dsize, srcnode));
580 talloc_free(tmp_ctx);
583 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing record unless the incoming one wins the rsn merge */
585 if (!(header.rsn < hdr->rsn ||
586 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
591 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
592 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
593 talloc_free(tmp_ctx);
598 talloc_free(tmp_ctx);
/* Pull database "dbid" from every active node into the recovery tdb
   (rsn-based merge per record via pull_one_remote_database).  A node
   that fails to deliver its copy is blamed with nodemap->num credits. */
604 pull all the remote database contents into the recdb
606 static int pull_remote_database(struct ctdb_context *ctdb,
607 struct ctdb_recoverd *rec,
608 struct ctdb_node_map *nodemap,
609 struct tdb_wrap *recdb, uint32_t dbid)
613 /* pull all records from all other nodes across onto this node
614 (this merges based on rsn)
616 for (j=0; j<nodemap->num; j++) {
617 /* dont merge from nodes that are unavailable */
618 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
621 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
622 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
623 nodemap->nodes[j].pnn));
624 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
/* Propagate node flags to all active nodes via the MODIFY_FLAGS control
   (set "flags", clear everything else).  Returns 0/-1. */
634 update flags on all active nodes
636 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
640 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
642 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Push our vnnmap to every active node so the whole cluster agrees on
   generation and lmaster layout.  Returns 0/-1. */
650 ensure all nodes have the same vnnmap we do
652 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
653 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
657 /* push the new vnn map out to all the nodes */
658 for (j=0; j<nodemap->num; j++) {
659 /* dont push to nodes that are unavailable */
660 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
664 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
666 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* One in-flight vacuuming work item: a marshalled batch of records
   (recs) received from srcnode for one database, with r tracking the
   current position in the batch.  Linked into rec->vacuum_info. */
676 struct vacuum_info *next, *prev;
677 struct ctdb_recoverd *rec;
679 struct ctdb_db_context *ctdb_db;
680 struct ctdb_marshall_buffer *recs;
681 struct ctdb_rec_data *r;
/* forward declaration - the fetch loop and its callback are mutually recursive */
684 static void vacuum_fetch_next(struct vacuum_info *v);
/* Completion callback for one vacuum fetch call: simply advance to the
   next record in the batch. */
687 called when a vacuum fetch has completed - just free it and do the next one
689 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
691 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
693 vacuum_fetch_next(v);
/* Walk the vacuum batch: for each record, if it is not already local,
   issue a CTDB_NULL_FUNC call with CTDB_IMMEDIATE_MIGRATION to migrate
   the record to this node, then continue from vacuum_fetch_callback.
   Records we cannot lock without blocking are skipped - the recovery
   daemon must never stall on a tdb chainlock. */
698 process the next element from the vacuum list
700 static void vacuum_fetch_next(struct vacuum_info *v)
702 struct ctdb_call call;
703 struct ctdb_rec_data *r;
705 while (v->recs->count) {
706 struct ctdb_client_call_state *state;
708 struct ctdb_ltdb_header *hdr;
711 call.call_id = CTDB_NULL_FUNC;
712 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance the cursor past the current variable-length record */
715 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
718 call.key.dptr = &r->data[0];
719 call.key.dsize = r->keylen;
721 /* ensure we don't block this daemon - just skip a record if we can't get
723 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
727 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
728 if (data.dptr == NULL) {
729 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
733 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
735 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
739 hdr = (struct ctdb_ltdb_header *)data.dptr;
740 if (hdr->dmaster == v->rec->ctdb->pnn) {
741 /* its already local */
743 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
749 state = ctdb_call_send(v->ctdb_db, &call);
/* drop the chainlock before waiting; the daemon serialises the migration */
750 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
752 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
756 state->async.fn = vacuum_fetch_callback;
757 state->async.private_data = v;
/* talloc destructor: unlink the work item from rec->vacuum_info when freed */
766 destroy a vacuum info structure
768 static int vacuum_info_destructor(struct vacuum_info *v)
770 DLIST_REMOVE(v->rec->vacuum_info, v);
/* Message handler for vacuum-fetch requests: another node sent us a
   marshalled batch of records it wants us to take over.  We ignore the
   batch if one from the same (srcnode, db) pair is already in progress,
   look up the database (attaching if necessary), copy the batch into a
   new vacuum_info work item and start processing it asynchronously. */
776 handler for vacuum fetch
778 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
779 TDB_DATA data, void *private_data)
781 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
782 struct ctdb_marshall_buffer *recs;
784 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
786 struct ctdb_dbid_map *dbmap=NULL;
787 bool persistent = false;
788 struct ctdb_db_context *ctdb_db;
789 struct ctdb_rec_data *r;
791 struct vacuum_info *v;
793 recs = (struct ctdb_marshall_buffer *)data.dptr;
794 r = (struct ctdb_rec_data *)&recs->data[0];
796 if (recs->count == 0) {
797 talloc_free(tmp_ctx);
/* drop the batch if we are already vacuuming this db for this source node */
803 for (v=rec->vacuum_info;v;v=v->next) {
804 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
805 /* we're already working on records from this node */
806 talloc_free(tmp_ctx);
811 /* work out if the database is persistent */
812 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
814 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
815 talloc_free(tmp_ctx);
819 for (i=0;i<dbmap->num;i++) {
820 if (dbmap->dbs[i].dbid == recs->db_id) {
821 persistent = dbmap->dbs[i].persistent;
825 if (i == dbmap->num) {
826 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
827 talloc_free(tmp_ctx);
831 /* find the name of this database */
832 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
833 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
834 talloc_free(tmp_ctx);
/* attach is idempotent - returns the existing ctdb_db if already attached */
839 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
840 if (ctdb_db == NULL) {
841 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
842 talloc_free(tmp_ctx);
846 v = talloc_zero(rec, struct vacuum_info);
848 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
849 talloc_free(tmp_ctx);
854 v->srcnode = srcnode;
855 v->ctdb_db = ctdb_db;
/* take our own copy of the batch - "data" belongs to the message layer */
856 v->recs = talloc_memdup(v, recs, data.dsize);
857 if (v->recs == NULL) {
858 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
860 talloc_free(tmp_ctx);
863 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
865 DLIST_ADD(rec->vacuum_info, v);
867 talloc_set_destructor(v, vacuum_info_destructor);
869 vacuum_fetch_next(v);
870 talloc_free(tmp_ctx);
/* Timer callback for ctdb_wait_timeout: flags completion through the
   uint32_t pointed to by p. */
875 called when ctdb_wait_timeout should finish
877 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
878 struct timeval yt, void *p)
880 uint32_t *timed_out = (uint32_t *)p;
/* Block for "secs" seconds while still pumping the event loop, so
   timers and messages continue to be serviced during the wait. */
885 wait for a given number of seconds
887 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
889 uint32_t timed_out = 0;
890 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
892 event_loop_once(ctdb->ev);
/* Timer callback: the election window has closed; clearing
   rec->election_timeout lets ctdb_wait_election return. */
897 called when an election times out (ends)
899 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
900 struct timeval t, void *p)
902 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
903 rec->election_timeout = NULL;
905 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
/* Pump the event loop until the election timer fires (each incoming
   election packet re-arms rec->election_timeout elsewhere). */
910 wait for an election to finish. It finished election_timeout seconds after
911 the last election packet is received
913 static void ctdb_wait_election(struct ctdb_recoverd *rec)
915 struct ctdb_context *ctdb = rec->ctdb;
916 while (rec->election_timeout) {
917 event_loop_once(ctdb->ev);
/* Reconcile node flags: fetch each connected remote node's view of its
   own flags and, on mismatch, push the authoritative flags cluster-wide
   and adopt the remote value locally.  Returns MONITOR_OK or
   MONITOR_FAILED (declared int - enum values convert implicitly). */
922 Update our local flags from all remote connected nodes.
923 This is only run when we are or we believe we are the recovery master
925 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
928 struct ctdb_context *ctdb = rec->ctdb;
929 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
931 /* get the nodemap for all active remote nodes and verify
932 they are the same as for this node
934 for (j=0; j<nodemap->num; j++) {
935 struct ctdb_node_map *remote_nodemap=NULL;
938 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
941 if (nodemap->nodes[j].pnn == ctdb->pnn) {
945 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
946 mem_ctx, &remote_nodemap);
948 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
949 nodemap->nodes[j].pnn));
950 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
951 talloc_free(mem_ctx);
952 return MONITOR_FAILED;
/* each node is authoritative for its own flags */
954 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
955 /* We should tell our daemon about this so it
956 updates its flags or else we will log the same
957 message again in the next iteration of recovery.
958 Since we are the recovery master we can just as
959 well update the flags on all nodes.
961 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
963 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
967 /* Update our local copy of the flags in the recovery
970 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
971 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
972 nodemap->nodes[j].flags));
973 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
975 talloc_free(remote_nodemap);
977 talloc_free(mem_ctx);
/* Generate a random recovery generation id, retrying until the value
   differs from INVALID_GENERATION. */
982 /* Create a new random generation ip.
983 The generation id can not be the INVALID_GENERATION id
985 static uint32_t new_generation(void)
990 generation = random();
992 if (generation != INVALID_GENERATION) {
/* Create the temporary recovery database (recdb.tdb) under the ctdb db
   directory.  TDB_NOLOCK is safe because only this single-threaded
   daemon touches it; NOMMAP is added when scheduling tweaks are off.
   O_EXCL ensures a stale file from a previous run is not reused. */
1002 create a temporary working database
1004 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1007 struct tdb_wrap *recdb;
1010 /* open up the temporary recovery database */
1011 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
1017 tdb_flags = TDB_NOLOCK;
1018 if (!ctdb->do_setsched) {
1019 tdb_flags |= TDB_NOMMAP;
1022 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1023 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1024 if (recdb == NULL) {
1025 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
/* tdb_traverse_read callback used by push_recdb_database: skips empty
   records, rewrites each record's dmaster to this node and appends it
   to the growing marshall buffer in *p (a struct recdb_data).  Sets
   params->failed on allocation failure. */
1035 a traverse function for pulling all relevant records from recdb
1038 struct ctdb_context *ctdb;
1039 struct ctdb_marshall_buffer *recdata;
1044 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1046 struct recdb_data *params = (struct recdb_data *)p;
1047 struct ctdb_rec_data *rec;
1048 struct ctdb_ltdb_header *hdr;
1050 /* skip empty records */
1051 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1055 /* update the dmaster field to point to us */
1056 hdr = (struct ctdb_ltdb_header *)data.dptr;
1057 hdr->dmaster = params->ctdb->pnn;
1059 /* add the record to the blob ready to send to the nodes */
1060 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1062 params->failed = true;
1065 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1066 if (params->recdata == NULL) {
1067 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1068 rec->length + params->len, params->recdata->count));
1069 params->failed = true;
1072 params->recdata->count++;
1073 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1074 params->len += rec->length;
1081 push the recdb database out to all nodes
1083 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1084 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1086 struct recdb_data params;
1087 struct ctdb_marshall_buffer *recdata;
1089 TALLOC_CTX *tmp_ctx;
1092 tmp_ctx = talloc_new(ctdb);
1093 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1095 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1096 CTDB_NO_MEMORY(ctdb, recdata);
1098 recdata->db_id = dbid;
1101 params.recdata = recdata;
1102 params.len = offsetof(struct ctdb_marshall_buffer, data);
1103 params.failed = false;
1105 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1106 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1107 talloc_free(params.recdata);
1108 talloc_free(tmp_ctx);
1112 if (params.failed) {
1113 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1114 talloc_free(params.recdata);
1115 talloc_free(tmp_ctx);
1119 recdata = params.recdata;
1121 outdata.dptr = (void *)recdata;
1122 outdata.dsize = params.len;
1124 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1125 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1127 CONTROL_TIMEOUT(), false, outdata,
1130 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1131 talloc_free(recdata);
1132 talloc_free(tmp_ctx);
1136 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1137 dbid, recdata->count));
1139 talloc_free(recdata);
1140 talloc_free(tmp_ctx);
/* Full recovery of one database: merge all nodes' copies into a fresh
   temporary recdb, wipe the database cluster-wide inside the recovery
   transaction, then push the merged content (with dmaster rewritten to
   this node) back out to every active node. */
1147 go through a full recovery on one database
1149 static int recover_database(struct ctdb_recoverd *rec,
1150 TALLOC_CTX *mem_ctx,
1153 struct ctdb_node_map *nodemap,
1154 uint32_t transaction_id)
1156 struct tdb_wrap *recdb;
1158 struct ctdb_context *ctdb = rec->ctdb;
1160 struct ctdb_control_wipe_database w;
1163 recdb = create_recdb(ctdb, mem_ctx);
1164 if (recdb == NULL) {
1168 /* pull all remote databases onto the recdb */
1169 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
1171 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1175 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1177 /* wipe all the remote databases. This is safe as we are in a transaction */
1179 w.transaction_id = transaction_id;
1181 data.dptr = (void *)&w;
1182 data.dsize = sizeof(w);
1184 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1185 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1187 CONTROL_TIMEOUT(), false, data,
1190 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1195 /* push out the correct database. This sets the dmaster and skips
1196 the empty records */
1197 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1203 /* all done with this database */
/* Re-read the nodes file so membership changes take effect. */
1210 reload the nodes file
1212 static void reload_nodes_file(struct ctdb_context *ctdb)
1215 ctdb_load_nodes_file(ctdb);
1220 we are the recmaster, and recovery is needed - start a recovery run
1222 static int do_recovery(struct ctdb_recoverd *rec,
1223 TALLOC_CTX *mem_ctx, uint32_t pnn,
1224 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1226 struct ctdb_context *ctdb = rec->ctdb;
1228 uint32_t generation;
1229 struct ctdb_dbid_map *dbmap;
1232 struct timeval start_time;
1234 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1236 /* if recovery fails, force it again */
1237 rec->need_recovery = true;
1239 for (i=0; i<ctdb->num_nodes; i++) {
1240 struct ctdb_banning_state *ban_state;
1242 if (ctdb->nodes[i]->ban_state == NULL) {
1245 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1246 if (ban_state->count < 2*ctdb->num_nodes) {
1249 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1250 ctdb->nodes[i]->pnn, ban_state->count,
1251 ctdb->tunable.recovery_ban_period));
1252 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1253 ban_state->count = 0;
1257 if (ctdb->tunable.verify_recovery_lock != 0) {
1258 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1259 start_time = timeval_current();
1260 if (!ctdb_recovery_lock(ctdb, true)) {
1261 ctdb_set_culprit(rec, pnn);
1262 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1265 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1266 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1269 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1271 /* get a list of all databases */
1272 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1274 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1278 /* we do the db creation before we set the recovery mode, so the freeze happens
1279 on all databases we will be dealing with. */
1281 /* verify that we have all the databases any other node has */
1282 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1284 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1288 /* verify that all other nodes have all our databases */
1289 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1291 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1294 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1296 /* update the database priority for all remote databases */
1297 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1299 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1301 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1304 /* set recovery mode to active on all nodes */
1305 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1311 /* execute the "startrecovery" event script on all nodes */
1312 ret = run_startrecovery_eventscript(rec, nodemap);
1314 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1318 /* pick a new generation number */
1319 generation = new_generation();
1321 /* change the vnnmap on this node to use the new generation
1322 number but not on any other nodes.
1323 this guarantees that if we abort the recovery prematurely
1324 for some reason (a node stops responding?)
1325 that we can just return immediately and we will reenter
1326 recovery shortly again.
1327 I.e. we deliberately leave the cluster with an inconsistent
1328 generation id to allow us to abort recovery at any stage and
1329 just restart it from scratch.
1331 vnnmap->generation = generation;
1332 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1334 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1338 data.dptr = (void *)&generation;
1339 data.dsize = sizeof(uint32_t);
1341 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1342 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1344 CONTROL_TIMEOUT(), false, data,
1346 transaction_start_fail_callback,
1348 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1349 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1351 CONTROL_TIMEOUT(), false, tdb_null,
1355 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1360 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1362 for (i=0;i<dbmap->num;i++) {
1363 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1364 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1369 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1371 /* commit all the changes */
1372 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1374 CONTROL_TIMEOUT(), false, data,
1377 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1381 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1384 /* update the capabilities for all nodes */
1385 ret = update_capabilities(ctdb, nodemap);
1387 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1391 /* build a new vnn map with all the currently active and
1393 generation = new_generation();
1394 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1395 CTDB_NO_MEMORY(ctdb, vnnmap);
1396 vnnmap->generation = generation;
1398 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1399 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1400 for (i=j=0;i<nodemap->num;i++) {
1401 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1404 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1405 /* this node can not be an lmaster */
1406 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1411 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1412 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1413 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1416 if (vnnmap->size == 0) {
1417 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1419 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1420 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1421 vnnmap->map[0] = pnn;
1424 /* update to the new vnnmap on all nodes */
1425 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1427 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1431 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1433 /* update recmaster to point to us for all nodes */
1434 ret = set_recovery_master(ctdb, nodemap, pnn);
1436 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1440 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1443 update all nodes to have the same flags that we have
1445 for (i=0;i<nodemap->num;i++) {
1446 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1450 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1452 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1457 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1459 /* disable recovery mode */
1460 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1462 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1466 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1469 tell nodes to takeover their public IPs
1471 rec->need_takeover_run = false;
1472 ret = ctdb_takeover_run(ctdb, nodemap);
1474 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1477 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1479 /* execute the "recovered" event script on all nodes */
1480 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1482 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1486 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1488 /* send a message to all clients telling them that the cluster
1489 has been reconfigured */
1490 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1492 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1494 rec->need_recovery = false;
1496 /* we managed to complete a full recovery, make sure to forgive
1497 any past sins by the nodes that could now participate in the
1500 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1501 for (i=0;i<nodemap->num;i++) {
1502 struct ctdb_banning_state *ban_state;
1504 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1508 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1509 if (ban_state == NULL) {
1513 ban_state->count = 0;
1517 /* We just finished a recovery successfully.
1518 We now wait for rerecovery_timeout before we allow
1519 another recovery to take place.
1521 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1522 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1523 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;		/* nodes the sender can see - primary criterion */
	struct timeval priority_time;	/* when the sender's daemon started - older wins */
	uint32_t pnn;			/* physical node number of the sender; lowest wins as tiebreak */
	uint32_t node_flags;		/* sender's flags (banned/stopped nodes lose automatically) */
};
1541 form this nodes election data
1543 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1546 struct ctdb_node_map *nodemap;
1547 struct ctdb_context *ctdb = rec->ctdb;
1551 em->pnn = rec->ctdb->pnn;
1552 em->priority_time = rec->priority_time;
1554 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1556 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1560 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1561 em->node_flags = rec->node_flags;
1563 for (i=0;i<nodemap->num;i++) {
1564 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1565 em->num_connected++;
1569 /* we shouldnt try to win this election if we cant be a recmaster */
1570 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1571 em->num_connected = 0;
1572 em->priority_time = timeval_current();
1575 talloc_free(nodemap);
1579 see if the given election data wins
1581 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1583 struct election_message myem;
1586 ctdb_election_data(rec, &myem);
1588 /* we cant win if we dont have the recmaster capability */
1589 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1593 /* we cant win if we are banned */
1594 if (rec->node_flags & NODE_FLAGS_BANNED) {
1598 /* we cant win if we are stopped */
1599 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1603 /* we will automatically win if the other node is banned */
1604 if (em->node_flags & NODE_FLAGS_BANNED) {
1608 /* we will automatically win if the other node is banned */
1609 if (em->node_flags & NODE_FLAGS_STOPPED) {
1613 /* try to use the most connected node */
1615 cmp = (int)myem.num_connected - (int)em->num_connected;
1618 /* then the longest running node */
1620 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1624 cmp = (int)myem.pnn - (int)em->pnn;
1631 send out an election request
1633 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1636 TDB_DATA election_data;
1637 struct election_message emsg;
1639 struct ctdb_context *ctdb = rec->ctdb;
1641 srvid = CTDB_SRVID_RECOVERY;
1643 ctdb_election_data(rec, &emsg);
1645 election_data.dsize = sizeof(struct election_message);
1646 election_data.dptr = (unsigned char *)&emsg;
1649 /* send an election message to all active nodes */
1650 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1651 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1654 /* A new node that is already frozen has entered the cluster.
1655 The existing nodes are not frozen and dont need to be frozen
1656 until the election has ended and we start the actual recovery
1658 if (update_recmaster == true) {
1659 /* first we assume we will win the election and set
1660 recoverymaster to be ourself on the current node
1662 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1664 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1674 this function will unban all nodes in the cluster
1676 static void unban_all_nodes(struct ctdb_context *ctdb)
1679 struct ctdb_node_map *nodemap;
1680 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1682 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1684 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1688 for (i=0;i<nodemap->num;i++) {
1689 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1690 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1691 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1695 talloc_free(tmp_ctx);
1700 we think we are winning the election - send a broadcast election request
1702 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1704 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1707 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1709 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1712 talloc_free(rec->send_election_te);
1713 rec->send_election_te = NULL;
1717 handler for memory dumps
1719 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1720 TDB_DATA data, void *private_data)
1722 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1725 struct rd_memdump_reply *rd;
1727 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1728 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1729 talloc_free(tmp_ctx);
1732 rd = (struct rd_memdump_reply *)data.dptr;
1734 dump = talloc_zero(tmp_ctx, TDB_DATA);
1736 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1737 talloc_free(tmp_ctx);
1740 ret = ctdb_dump_memory(ctdb, dump);
1742 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1743 talloc_free(tmp_ctx);
1747 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1749 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1751 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1752 talloc_free(tmp_ctx);
1756 talloc_free(tmp_ctx);
1760 handler for reload_nodes
1762 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1763 TDB_DATA data, void *private_data)
1765 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1767 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1769 reload_nodes_file(rec->ctdb);
1773 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1774 struct timeval yt, void *p)
1776 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1778 talloc_free(rec->ip_check_disable_ctx);
1779 rec->ip_check_disable_ctx = NULL;
1782 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1783 TDB_DATA data, void *private_data)
1785 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1788 if (rec->ip_check_disable_ctx != NULL) {
1789 talloc_free(rec->ip_check_disable_ctx);
1790 rec->ip_check_disable_ctx = NULL;
1793 if (data.dsize != sizeof(uint32_t)) {
1794 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu expexting %lu\n", data.dsize, sizeof(uint32_t)));
1797 if (data.dptr == NULL) {
1798 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1802 timeout = *((uint32_t *)data.dptr);
1803 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1805 rec->ip_check_disable_ctx = talloc_new(rec);
1806 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
1808 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1813 handler for ip reallocate, just add it to the list of callers and
1814 handle this later in the monitor_cluster loop so we do not recurse
1815 with other callers to takeover_run()
1817 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1818 TDB_DATA data, void *private_data)
1820 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1821 struct ip_reallocate_list *caller;
1823 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1824 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1828 if (rec->ip_reallocate_ctx == NULL) {
1829 rec->ip_reallocate_ctx = talloc_new(rec);
1830 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1833 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1834 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1836 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1837 caller->next = rec->reallocate_callers;
1838 rec->reallocate_callers = caller;
1843 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1845 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1848 struct ip_reallocate_list *callers;
1850 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1851 ret = ctdb_takeover_run(ctdb, rec->nodemap);
1852 result.dsize = sizeof(int32_t);
1853 result.dptr = (uint8_t *)&ret;
1855 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1856 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1857 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1859 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1863 talloc_free(tmp_ctx);
1864 talloc_free(rec->ip_reallocate_ctx);
1865 rec->ip_reallocate_ctx = NULL;
1866 rec->reallocate_callers = NULL;
1872 handler for recovery master elections
1874 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1875 TDB_DATA data, void *private_data)
1877 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1879 struct election_message *em = (struct election_message *)data.dptr;
1880 TALLOC_CTX *mem_ctx;
1882 /* we got an election packet - update the timeout for the election */
1883 talloc_free(rec->election_timeout);
1884 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1885 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1886 ctdb_election_timeout, rec);
1888 mem_ctx = talloc_new(ctdb);
1890 /* someone called an election. check their election data
1891 and if we disagree and we would rather be the elected node,
1892 send a new election message to all other nodes
1894 if (ctdb_election_win(rec, em)) {
1895 if (!rec->send_election_te) {
1896 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1897 timeval_current_ofs(0, 500000),
1898 election_send_request, rec);
1900 talloc_free(mem_ctx);
1901 /*unban_all_nodes(ctdb);*/
1906 talloc_free(rec->send_election_te);
1907 rec->send_election_te = NULL;
1909 if (ctdb->tunable.verify_recovery_lock != 0) {
1910 /* release the recmaster lock */
1911 if (em->pnn != ctdb->pnn &&
1912 ctdb->recovery_lock_fd != -1) {
1913 close(ctdb->recovery_lock_fd);
1914 ctdb->recovery_lock_fd = -1;
1915 unban_all_nodes(ctdb);
1919 /* ok, let that guy become recmaster then */
1920 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1922 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1923 talloc_free(mem_ctx);
1927 talloc_free(mem_ctx);
1933 force the start of the election process
1935 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1936 struct ctdb_node_map *nodemap)
1939 struct ctdb_context *ctdb = rec->ctdb;
1941 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1943 /* set all nodes to recovery mode to stop all internode traffic */
1944 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1946 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1950 talloc_free(rec->election_timeout);
1951 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1952 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1953 ctdb_election_timeout, rec);
1955 ret = send_election_request(rec, pnn, true);
1957 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1961 /* wait for a few seconds to collect all responses */
1962 ctdb_wait_election(rec);
1968 handler for when a node changes its flags
1970 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1971 TDB_DATA data, void *private_data)
1974 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1975 struct ctdb_node_map *nodemap=NULL;
1976 TALLOC_CTX *tmp_ctx;
1977 uint32_t changed_flags;
1979 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1980 int disabled_flag_changed;
1982 if (data.dsize != sizeof(*c)) {
1983 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1987 tmp_ctx = talloc_new(ctdb);
1988 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1990 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1992 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1993 talloc_free(tmp_ctx);
1998 for (i=0;i<nodemap->num;i++) {
1999 if (nodemap->nodes[i].pnn == c->pnn) break;
2002 if (i == nodemap->num) {
2003 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2004 talloc_free(tmp_ctx);
2008 changed_flags = c->old_flags ^ c->new_flags;
2010 if (nodemap->nodes[i].flags != c->new_flags) {
2011 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2014 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2016 nodemap->nodes[i].flags = c->new_flags;
2018 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2019 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2022 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2023 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2027 ctdb->recovery_master == ctdb->pnn &&
2028 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2029 /* Only do the takeover run if the perm disabled or unhealthy
2030 flags changed since these will cause an ip failover but not
2032 If the node became disconnected or banned this will also
2033 lead to an ip address failover but that is handled
2036 if (disabled_flag_changed) {
2037 rec->need_takeover_run = true;
2041 talloc_free(tmp_ctx);
2045 handler for when we need to push out flag changes ot all other nodes
2047 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2048 TDB_DATA data, void *private_data)
2051 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2053 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
2055 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
2060 struct verify_recmode_normal_data {
2062 enum monitor_result status;
2065 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2067 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2070 /* one more node has responded with recmode data*/
2073 /* if we failed to get the recmode, then return an error and let
2074 the main loop try again.
2076 if (state->state != CTDB_CONTROL_DONE) {
2077 if (rmdata->status == MONITOR_OK) {
2078 rmdata->status = MONITOR_FAILED;
2083 /* if we got a response, then the recmode will be stored in the
2086 if (state->status != CTDB_RECOVERY_NORMAL) {
2087 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2088 rmdata->status = MONITOR_RECOVERY_NEEDED;
2095 /* verify that all nodes are in normal recovery mode */
2096 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2098 struct verify_recmode_normal_data *rmdata;
2099 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2100 struct ctdb_client_control_state *state;
2101 enum monitor_result status;
2104 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2105 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2107 rmdata->status = MONITOR_OK;
2109 /* loop over all active nodes and send an async getrecmode call to
2111 for (j=0; j<nodemap->num; j++) {
2112 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2115 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2117 nodemap->nodes[j].pnn);
2118 if (state == NULL) {
2119 /* we failed to send the control, treat this as
2120 an error and try again next iteration
2122 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2123 talloc_free(mem_ctx);
2124 return MONITOR_FAILED;
2127 /* set up the callback functions */
2128 state->async.fn = verify_recmode_normal_callback;
2129 state->async.private_data = rmdata;
2131 /* one more control to wait for to complete */
2136 /* now wait for up to the maximum number of seconds allowed
2137 or until all nodes we expect a response from has replied
2139 while (rmdata->count > 0) {
2140 event_loop_once(ctdb->ev);
2143 status = rmdata->status;
2144 talloc_free(mem_ctx);
2149 struct verify_recmaster_data {
2150 struct ctdb_recoverd *rec;
2153 enum monitor_result status;
2156 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2158 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2161 /* one more node has responded with recmaster data*/
2164 /* if we failed to get the recmaster, then return an error and let
2165 the main loop try again.
2167 if (state->state != CTDB_CONTROL_DONE) {
2168 if (rmdata->status == MONITOR_OK) {
2169 rmdata->status = MONITOR_FAILED;
2174 /* if we got a response, then the recmaster will be stored in the
2177 if (state->status != rmdata->pnn) {
2178 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2179 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2180 rmdata->status = MONITOR_ELECTION_NEEDED;
2187 /* verify that all nodes agree that we are the recmaster */
2188 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2190 struct ctdb_context *ctdb = rec->ctdb;
2191 struct verify_recmaster_data *rmdata;
2192 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2193 struct ctdb_client_control_state *state;
2194 enum monitor_result status;
2197 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2198 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2202 rmdata->status = MONITOR_OK;
2204 /* loop over all active nodes and send an async getrecmaster call to
2206 for (j=0; j<nodemap->num; j++) {
2207 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2210 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2212 nodemap->nodes[j].pnn);
2213 if (state == NULL) {
2214 /* we failed to send the control, treat this as
2215 an error and try again next iteration
2217 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2218 talloc_free(mem_ctx);
2219 return MONITOR_FAILED;
2222 /* set up the callback functions */
2223 state->async.fn = verify_recmaster_callback;
2224 state->async.private_data = rmdata;
2226 /* one more control to wait for to complete */
2231 /* now wait for up to the maximum number of seconds allowed
2232 or until all nodes we expect a response from has replied
2234 while (rmdata->count > 0) {
2235 event_loop_once(ctdb->ev);
2238 status = rmdata->status;
2239 talloc_free(mem_ctx);
2244 /* called to check that the allocation of public ip addresses is ok.
2246 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2248 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2249 struct ctdb_all_public_ips *ips = NULL;
2250 struct ctdb_uptime *uptime1 = NULL;
2251 struct ctdb_uptime *uptime2 = NULL;
2254 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2255 CTDB_CURRENT_NODE, &uptime1);
2257 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2258 talloc_free(mem_ctx);
2262 /* read the ip allocation from the local node */
2263 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2265 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2266 talloc_free(mem_ctx);
2270 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2271 CTDB_CURRENT_NODE, &uptime2);
2273 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2274 talloc_free(mem_ctx);
2278 /* skip the check if the startrecovery time has changed */
2279 if (timeval_compare(&uptime1->last_recovery_started,
2280 &uptime2->last_recovery_started) != 0) {
2281 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2282 talloc_free(mem_ctx);
2286 /* skip the check if the endrecovery time has changed */
2287 if (timeval_compare(&uptime1->last_recovery_finished,
2288 &uptime2->last_recovery_finished) != 0) {
2289 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2290 talloc_free(mem_ctx);
2294 /* skip the check if we have started but not finished recovery */
2295 if (timeval_compare(&uptime1->last_recovery_finished,
2296 &uptime1->last_recovery_started) != 1) {
2297 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
2298 talloc_free(mem_ctx);
2303 /* verify that we have the ip addresses we should have
2304 and we dont have ones we shouldnt have.
2305 if we find an inconsistency we set recmode to
2306 active on the local node and wait for the recmaster
2307 to do a full blown recovery
2309 for (j=0; j<ips->num; j++) {
2310 if (ips->ips[j].pnn == pnn) {
2311 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2312 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2313 ctdb_addr_to_str(&ips->ips[j].addr)));
2314 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2316 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2318 talloc_free(mem_ctx);
2321 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2323 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2325 talloc_free(mem_ctx);
2330 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2331 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2332 ctdb_addr_to_str(&ips->ips[j].addr)));
2334 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2336 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2338 talloc_free(mem_ctx);
2341 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2343 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2345 talloc_free(mem_ctx);
2352 talloc_free(mem_ctx);
2357 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2359 struct ctdb_node_map **remote_nodemaps = callback_data;
2361 if (node_pnn >= ctdb->num_nodes) {
2362 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2366 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
2370 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2371 struct ctdb_node_map *nodemap,
2372 struct ctdb_node_map **remote_nodemaps)
2376 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2377 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2379 CONTROL_TIMEOUT(), false, tdb_null,
2380 async_getnodemap_callback,
2382 remote_nodemaps) != 0) {
2383 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};

/* state shared between check_recovery_lock(), its child process and the
   associated timeout/fd events */
struct ctdb_check_reclock_state {
	struct ctdb_context *ctdb;
	struct timeval start_time;	/* to report reclock check latency */
	int fd[2];			/* pipe: child writes its status, parent reads - referenced by the destructor */
	pid_t child;			/* pid of the checking child, killed by the destructor */
	struct timed_event *te;		/* timeout event, fires if the child hangs */
	struct fd_event *fde;		/* read event on fd[0] */
	enum reclock_child_status status;
};
2402 /* when we free the reclock state we must kill any child process.
2404 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2406 struct ctdb_context *ctdb = state->ctdb;
2408 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2410 if (state->fd[0] != -1) {
2411 close(state->fd[0]);
2414 if (state->fd[1] != -1) {
2415 close(state->fd[1]);
2418 kill(state->child, SIGKILL);
2423 called if our check_reclock child times out. this would happen if
2424 i/o to the reclock file blocks.
2426 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2427 struct timeval t, void *private_data)
2429 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2430 struct ctdb_check_reclock_state);
2432 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2433 state->status = RECLOCK_TIMEOUT;
2436 /* this is called when the child process has completed checking the reclock
2437 file and has written data back to us through the pipe.
2439 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2440 uint16_t flags, void *private_data)
2442 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2443 struct ctdb_check_reclock_state);
2447 /* we got a response from our child process so we can abort the
2450 talloc_free(state->te);
2453 ret = read(state->fd[0], &c, 1);
2454 if (ret != 1 || c != RECLOCK_OK) {
2455 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2456 state->status = RECLOCK_FAILED;
2461 state->status = RECLOCK_OK;
/*
 * Verify that the recovery lock we hold is still valid (not stale) by
 * forking a child that does a bounded read of the reclock file, while
 * the parent waits with a 15 second timeout. Returns non-zero paths on
 * failure (elided returns not visible in this listing).
 */
2465 static int check_recovery_lock(struct ctdb_context *ctdb)
2468 struct ctdb_check_reclock_state *state;
/* captured before fork() so the child can poll whether we still live */
2469 pid_t parent = getpid();
/* we can only be called while actually holding the reclock fd */
2471 if (ctdb->recovery_lock_fd == -1) {
2472 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2476 state = talloc(ctdb, struct ctdb_check_reclock_state);
2477 CTDB_NO_MEMORY(ctdb, state);
2480 state->start_time = timeval_current();
2481 state->status = RECLOCK_CHECKING;
/* pipe used by the child to report the check result back to us */
2485 ret = pipe(state->fd);
2488 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2492 state->child = fork();
2493 if (state->child == (pid_t)-1) {
2494 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2495 close(state->fd[0]);
2497 close(state->fd[1]);
/* child: probe the reclock file, then report and linger until parent dies */
2503 if (state->child == 0) {
2504 char cc = RECLOCK_OK;
/* child only writes; close the read end */
2505 close(state->fd[0]);
/* a 1-byte pread at offset 0 is enough to detect a dead/stale CFS file */
2508 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2509 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2510 cc = RECLOCK_FAILED;
/* NOTE(review): write() return value is ignored here and below */
2513 write(state->fd[1], &cc, 1);
2514 /* make sure we die when our parent dies */
2515 while (kill(parent, 0) == 0 || errno != ESRCH) {
2517 write(state->fd[1], &cc, 1);
/* parent: keep only the read end and don't leak it across exec */
2521 close(state->fd[1]);
2523 set_close_on_exec(state->fd[0]);
2525 DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
/* destructor (not visible here) reports latency and cleans up fds/child */
2527 talloc_set_destructor(state, check_reclock_destructor);
/* give the child 15 seconds before declaring the check hung */
2529 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2530 ctdb_check_reclock_timeout, state);
2531 if (state->te == NULL) {
2532 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2537 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2538 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2539 reclock_child_handler,
2542 if (state->fde == NULL) {
2543 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* nested event loop: block here until child reply or timeout flips status */
2548 while (state->status == RECLOCK_CHECKING) {
2549 event_loop_once(ctdb->ev);
/* on failure drop the now-untrustworthy lock fd so it gets re-taken */
2552 if (state->status == RECLOCK_FAILED) {
2553 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2554 close(ctdb->recovery_lock_fd);
2555 ctdb->recovery_lock_fd = -1;
/*
 * Ask the local ctdbd for the currently configured reclock file path
 * and bring our cached copy (ctdb->recovery_lock_file / _fd) in sync.
 * Handles three cases: reclock disabled, reclock newly enabled, and
 * reclock path changed. Returns 0 on success, -1 on failure to query
 * the daemon (elided returns not visible in this listing).
 */
2564 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2566 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2567 const char *reclockfile;
2569 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2570 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2571 talloc_free(tmp_ctx);
/* case 1: daemon reports no reclock file — disable verification */
2575 if (reclockfile == NULL) {
2576 if (ctdb->recovery_lock_file != NULL) {
2577 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2578 talloc_free(ctdb->recovery_lock_file);
2579 ctdb->recovery_lock_file = NULL;
2580 if (ctdb->recovery_lock_fd != -1) {
2581 close(ctdb->recovery_lock_fd);
2582 ctdb->recovery_lock_fd = -1;
2585 ctdb->tunable.verify_recovery_lock = 0;
2586 talloc_free(tmp_ctx);
/* case 2: we had no reclock file cached — adopt the daemon's path */
2590 if (ctdb->recovery_lock_file == NULL) {
2591 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2592 if (ctdb->recovery_lock_fd != -1) {
2593 close(ctdb->recovery_lock_fd);
2594 ctdb->recovery_lock_fd = -1;
2596 talloc_free(tmp_ctx);
/* unchanged path: nothing to do */
2601 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2602 talloc_free(tmp_ctx);
/* case 3: path changed — swap in the new one and drop the old fd.
 * NOTE(review): verify_recovery_lock is cleared here too; presumably it
 * is re-enabled elsewhere once the new file is locked — confirm. */
2606 talloc_free(ctdb->recovery_lock_file);
2607 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2608 ctdb->tunable.verify_recovery_lock = 0;
2609 if (ctdb->recovery_lock_fd != -1) {
2610 close(ctdb->recovery_lock_fd);
2611 ctdb->recovery_lock_fd = -1;
2614 talloc_free(tmp_ctx);
2619 the main monitoring loop
/*
 * Core loop of the recovery daemon. Sets up message handlers once, then
 * iterates forever: ping the main daemon, refresh tunables/nodemap/vnnmap,
 * verify that a valid recovery master exists and that all nodes agree on
 * membership, flags and the vnnmap — triggering elections or do_recovery()
 * whenever an inconsistency is found. Per-iteration allocations hang off
 * mem_ctx, which is recycled at the top of each pass.
 */
2621 static void monitor_cluster(struct ctdb_context *ctdb)
2624 TALLOC_CTX *mem_ctx=NULL;
2625 struct ctdb_node_map *nodemap=NULL;
2626 struct ctdb_node_map *recmaster_nodemap=NULL;
2627 struct ctdb_node_map **remote_nodemaps=NULL;
2628 struct ctdb_vnn_map *vnnmap=NULL;
2629 struct ctdb_vnn_map *remote_vnnmap=NULL;
2630 int32_t debug_level;
2632 struct ctdb_recoverd *rec;
2634 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2636 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2637 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2641 rec->priority_time = timeval_current();
2643 /* register a message port for sending memory dumps */
2644 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2646 /* register a message port for recovery elections */
2647 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2649 /* when nodes are disabled/enabled */
2650 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
/* when we are asked to push out a flag change */
2653 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2655 /* register a message port for vacuum fetch */
2656 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2658 /* register a message port for reloadnodes */
2659 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2661 /* register a message port for performing a takeover run */
2662 ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2664 /* register a message port for disabling the ip check for a short while */
2665 ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
/* top of the (elided) main loop: discard last iteration's allocations */
2669 talloc_free(mem_ctx);
2672 mem_ctx = talloc_new(ctdb);
2674 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2678 /* we only check for recovery once every second */
2679 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2681 /* verify that the main daemon is still running */
2682 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2683 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2687 /* ping the local daemon to tell it we are alive */
2688 ctdb_ctrl_recd_ping(ctdb);
2690 if (rec->election_timeout) {
2691 /* an election is in progress */
2695 /* read the debug level from the parent and update locally */
2696 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2698 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2701 LogLevel = debug_level;
2704 /* We must check if we need to ban a node here but we want to do this
2705 as early as possible so we dont wait until we have pulled the node
2706 map from the local node. thats why we have the hardcoded value 20
/* ban any node that has caused 20 or more recoveries recently; the
 * threshold is hardcoded (see comment above) because the nodemap is
 * not fetched yet at this point in the iteration */
2708 for (i=0; i<ctdb->num_nodes; i++) {
2709 struct ctdb_banning_state *ban_state;
2711 if (ctdb->nodes[i]->ban_state == NULL) {
2714 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
2715 if (ban_state->count < 20) {
2718 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2719 ctdb->nodes[i]->pnn, ban_state->count,
2720 ctdb->tunable.recovery_ban_period));
2721 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2722 ban_state->count = 0;
2725 /* get relevant tunables */
2726 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2728 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2732 /* get the current recovery lock file from the server */
2733 if (update_recovery_lock_file(ctdb) != 0) {
2734 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2738 /* Make sure that if recovery lock verification becomes disabled when
/* ... we also release the fd we are holding on the reclock file */
2741 if (ctdb->tunable.verify_recovery_lock == 0) {
2742 if (ctdb->recovery_lock_fd != -1) {
2743 close(ctdb->recovery_lock_fd);
2744 ctdb->recovery_lock_fd = -1;
/* find out which physical node we are */
2748 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2749 if (pnn == (uint32_t)-1) {
2750 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2754 /* get the vnnmap */
2755 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2757 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2762 /* get number of nodes */
2764 talloc_free(rec->nodemap);
2765 rec->nodemap = NULL;
/* the nodemap is parented to rec, not mem_ctx: it must outlive an
 * iteration since other handlers reference rec->nodemap */
2768 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2770 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2773 nodemap = rec->nodemap;
2775 /* check which node is the recovery master */
2776 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2778 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2782 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2783 if (rec->recmaster != pnn) {
/* freeing the ctx drops all queued callers in one go */
2784 if (rec->ip_reallocate_ctx != NULL) {
2785 talloc_free(rec->ip_reallocate_ctx);
2786 rec->ip_reallocate_ctx = NULL;
2787 rec->reallocate_callers = NULL;
2790 /* if there are takeovers requested, perform it and notify the waiters */
2791 if (rec->reallocate_callers) {
2792 process_ipreallocate_requests(ctdb, rec);
/* no recmaster elected yet — start an election */
2795 if (rec->recmaster == (uint32_t)-1) {
2796 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2797 force_election(rec, pnn, nodemap);
2802 /* if the local daemon is STOPPED, we verify that the databases are
2803 also frozen and thet the recmode is set to active
2805 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2806 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2808 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2810 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2811 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2813 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2815 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2818 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2820 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2827 /* If the local node is stopped, verify we are not the recmaster
2828 and yield this role if so
2830 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2831 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2832 force_election(rec, pnn, nodemap);
2836 /* check that we (recovery daemon) and the local ctdb daemon
2837 agrees on whether we are banned or not
2841 /* remember our own node flags */
2842 rec->node_flags = nodemap->nodes[pnn].flags;
2844 /* count how many active nodes there are */
2845 rec->num_active = 0;
2846 rec->num_connected = 0;
2847 for (i=0; i<nodemap->num; i++) {
2848 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2851 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2852 rec->num_connected++;
2857 /* verify that the recmaster node is still active */
2858 for (j=0; j<nodemap->num; j++) {
2859 if (nodemap->nodes[j].pnn==rec->recmaster) {
/* j == num means the recmaster fell out of the nodemap entirely */
2864 if (j == nodemap->num) {
2865 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2866 force_election(rec, pnn, nodemap);
2870 /* if recovery master is disconnected we must elect a new recmaster */
2871 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2872 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2873 force_election(rec, pnn, nodemap);
2877 /* grap the nodemap from the recovery master to check if it is banned */
2878 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2879 mem_ctx, &recmaster_nodemap);
2881 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2882 nodemap->nodes[j].pnn));
/* the recmaster's view of its OWN entry (index j) decides */
2887 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2888 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2889 force_election(rec, pnn, nodemap);
2894 /* verify that we have all ip addresses we should have and we dont
2895 * have addresses we shouldnt have.
2897 if (ctdb->do_checkpublicip) {
/* ip_check_disable_ctx non-NULL means checks are temporarily disabled */
2898 if (rec->ip_check_disable_ctx == NULL) {
2899 if (verify_ip_allocation(ctdb, pnn) != 0) {
2900 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2907 /* if we are not the recmaster then we do not need to check
2908 if recovery is needed
2910 if (pnn != rec->recmaster) {
/* --- everything below runs only on the recovery master --- */
2915 /* ensure our local copies of flags are right */
2916 ret = update_local_flags(rec, nodemap);
2917 if (ret == MONITOR_ELECTION_NEEDED) {
2918 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2919 force_election(rec, pnn, nodemap);
2922 if (ret != MONITOR_OK) {
2923 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2927 /* update the list of public ips that a node can handle for
/* reload the nodes file if the daemon's node count disagrees */
2930 if (ctdb->num_nodes != nodemap->num) {
2931 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2932 reload_nodes_file(ctdb);
2935 for (j=0; j<nodemap->num; j++) {
2936 /* release any existing data */
2937 if (ctdb->nodes[j]->public_ips) {
2938 talloc_free(ctdb->nodes[j]->public_ips);
2939 ctdb->nodes[j]->public_ips = NULL;
/* skip inactive nodes — they hold no public ips */
2942 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2946 /* grab a new shiny list of public ips from the node */
2947 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2948 ctdb->nodes[j]->pnn,
2950 &ctdb->nodes[j]->public_ips)) {
2951 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2952 ctdb->nodes[j]->pnn));
2958 /* verify that all active nodes agree that we are the recmaster */
2959 switch (verify_recmaster(rec, nodemap, pnn)) {
2960 case MONITOR_RECOVERY_NEEDED:
2961 /* can not happen */
2963 case MONITOR_ELECTION_NEEDED:
2964 force_election(rec, pnn, nodemap);
2968 case MONITOR_FAILED:
2973 if (rec->need_recovery) {
2974 /* a previous recovery didn't finish */
2975 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2979 /* verify that all active nodes are in normal mode
2980 and not in recovery mode
2982 switch (verify_recmode(ctdb, nodemap)) {
2983 case MONITOR_RECOVERY_NEEDED:
2984 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2986 case MONITOR_FAILED:
2988 case MONITOR_ELECTION_NEEDED:
2989 /* can not happen */
2995 if (ctdb->tunable.verify_recovery_lock != 0) {
2996 /* we should have the reclock - check its not stale */
2997 ret = check_recovery_lock(ctdb);
2999 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3000 ctdb_set_culprit(rec, ctdb->pnn);
3001 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3006 /* get the nodemap for all active remote nodes
3008 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3009 if (remote_nodemaps == NULL) {
3010 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3013 for(i=0; i<nodemap->num; i++) {
3014 remote_nodemaps[i] = NULL;
3016 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3017 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3021 /* verify that all other nodes have the same nodemap as we have
3023 for (j=0; j<nodemap->num; j++) {
3024 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3028 if (remote_nodemaps[j] == NULL) {
3029 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3030 ctdb_set_culprit(rec, j);
3035 /* if the nodes disagree on how many nodes there are
3036 then this is a good reason to try recovery
3038 if (remote_nodemaps[j]->num != nodemap->num) {
3039 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3040 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3041 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3042 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3046 /* if the nodes disagree on which nodes exist and are
3047 active, then that is also a good reason to do recovery
3049 for (i=0;i<nodemap->num;i++) {
3050 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3051 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3052 nodemap->nodes[j].pnn, i,
3053 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3054 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3055 do_recovery(rec, mem_ctx, pnn, nodemap,
3061 /* verify the flags are consistent
3063 for (i=0; i<nodemap->num; i++) {
3064 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3068 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3069 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3070 nodemap->nodes[j].pnn,
3071 nodemap->nodes[i].pnn,
3072 remote_nodemaps[j]->nodes[i].flags,
/* NOTE(review): "our 0x%02x" prints nodes[j].flags but the
 * comparison above is against nodes[i].flags — index i looks
 * intended here; verify against upstream */
3073 nodemap->nodes[j].flags));
/* a node is authoritative for its OWN flags: if j disagrees about
 * itself (i == j), push j's view; otherwise push ours */
3075 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3076 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3077 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3078 do_recovery(rec, mem_ctx, pnn, nodemap,
3082 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3083 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3084 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3085 do_recovery(rec, mem_ctx, pnn, nodemap,
3094 /* there better be the same number of lmasters in the vnn map
3095 as there are active nodes or we will have to do a recovery
3097 if (vnnmap->size != rec->num_active) {
3098 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3099 vnnmap->size, rec->num_active));
3100 ctdb_set_culprit(rec, ctdb->pnn);
3101 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3105 /* verify that all active nodes in the nodemap also exist in
3108 for (j=0; j<nodemap->num; j++) {
3109 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* our own entry needs no vnnmap lookup */
3112 if (nodemap->nodes[j].pnn == pnn) {
3116 for (i=0; i<vnnmap->size; i++) {
3117 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3121 if (i == vnnmap->size) {
3122 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3123 nodemap->nodes[j].pnn));
3124 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3125 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3131 /* verify that all other nodes have the same vnnmap
3132 and are from the same generation
3134 for (j=0; j<nodemap->num; j++) {
3135 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3138 if (nodemap->nodes[j].pnn == pnn) {
3142 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3143 mem_ctx, &remote_vnnmap);
3145 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3146 nodemap->nodes[j].pnn));
3150 /* verify the vnnmap generation is the same */
3151 if (vnnmap->generation != remote_vnnmap->generation) {
3152 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3153 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3154 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3155 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3159 /* verify the vnnmap size is the same */
3160 if (vnnmap->size != remote_vnnmap->size) {
3161 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3162 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3163 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3164 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3168 /* verify the vnnmap is the same */
3169 for (i=0;i<vnnmap->size;i++) {
3170 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3171 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3172 nodemap->nodes[j].pnn));
3173 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3174 do_recovery(rec, mem_ctx, pnn, nodemap,
3181 /* we might need to change who has what IP assigned */
3182 if (rec->need_takeover_run) {
3183 rec->need_takeover_run = false;
3185 /* execute the "startrecovery" event script on all nodes */
3186 ret = run_startrecovery_eventscript(rec, nodemap);
3188 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3189 ctdb_set_culprit(rec, ctdb->pnn);
3190 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
/* reassign public ips across the (possibly changed) set of nodes */
3193 ret = ctdb_takeover_run(ctdb, nodemap);
3195 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3196 ctdb_set_culprit(rec, ctdb->pnn);
3197 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3200 /* execute the "recovered" event script on all nodes */
3201 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3203 // we cant check whether the event completed successfully
3204 // since this script WILL fail if the node is in recovery mode
3205 // and if that race happens, the code here would just cause a second
3206 // cascading recovery.
3208 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3209 ctdb_set_culprit(rec, ctdb->pnn);
3210 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3221 event handler for when the main ctdbd dies
/*
 * fd-event callback on the pipe from the main daemon: the pipe becoming
 * readable (EOF) means ctdbd exited, so the recovery daemon must exit
 * too. Exit path is elided from this listing.
 */
3223 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3224 uint16_t flags, void *private_data)
3226 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3231 called regularly to verify that the recovery daemon is still running
/*
 * Timed event in the MAIN daemon (not the recovery daemon): every 30s,
 * check the recoverd child is alive with kill(pid, 0). If it is gone,
 * perform an orderly shutdown of the whole node: stop subsystems,
 * release public IPs, shut down transport and run the "shutdown"
 * event script. Re-arms itself at the end.
 */
3233 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3234 struct timeval yt, void *p)
3236 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3238 if (kill(ctdb->recoverd_pid, 0) != 0) {
3239 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
3241 ctdb_stop_recoverd(ctdb);
3242 ctdb_stop_keepalive(ctdb);
3243 ctdb_stop_monitoring(ctdb);
3244 ctdb_release_all_ips(ctdb);
3245 if (ctdb->methods != NULL) {
3246 ctdb->methods->shutdown(ctdb);
3248 ctdb_event_script(ctdb, "shutdown");
/* reschedule ourselves for the next 30 second check */
3253 event_add_timed(ctdb->ev, ctdb,
3254 timeval_current_ofs(30, 0),
3255 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap exited children
 * non-blockingly with waitpid(WNOHANG) so helper processes (e.g. the
 * reclock checker) do not linger as zombies. Loop structure is partly
 * elided from this listing.
 */
3258 static void recd_sig_child_handler(struct event_context *ev,
3259 struct signal_event *se, int signum, int count,
3263 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3268 pid = waitpid(-1, &status, WNOHANG);
/* ECHILD just means nothing left to reap — only other errnos are real */
3270 if (errno != ECHILD) {
3271 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3276 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3282 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon. The parent arms the 30s ctdb_check_recd
 * watchdog and returns; the child switches to client mode, watches the
 * pipe for parent death, installs a SIGCHLD handler and then enters
 * monitor_cluster() — which never returns in normal operation.
 * Returns 0 in the parent on success (error returns elided here).
 */
3284 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3287 struct signal_event *se;
/* pipe whose read end tells the child when the parent dies */
3289 if (pipe(fd) != 0) {
3293 ctdb->ctdbd_pid = getpid();
3295 ctdb->recoverd_pid = fork();
3296 if (ctdb->recoverd_pid == -1) {
/* parent: start watchdog and return to the main daemon */
3300 if (ctdb->recoverd_pid != 0) {
3302 event_add_timed(ctdb->ev, ctdb,
3303 timeval_current_ofs(30, 0),
3304 ctdb_check_recd, ctdb);
/* child from here on */
/* reseed PRNG so parent and child do not share a random sequence */
3310 srandom(getpid() ^ time(NULL));
3312 if (switch_from_server_to_client(ctdb) != 0) {
3313 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3317 DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* EOF on fd[0] (parent exit) triggers ctdb_recoverd_parent -> exit */
3319 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3320 ctdb_recoverd_parent, &fd[0]);
3322 /* set up a handler to pick up sigchld */
3323 se = event_add_signal(ctdb->ev, ctdb,
3325 recd_sig_child_handler,
3328 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3332 monitor_cluster(ctdb);
/* monitor_cluster() loops forever; reaching here is a bug */
3334 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3339 shutdown the recovery daemon
/*
 * Ask the recovery daemon child to terminate with SIGTERM.
 * No-op when no recoverd was ever forked (pid still 0).
 */
3341 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3343 if (ctdb->recoverd_pid == 0) {
3347 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3348 kill(ctdb->recoverd_pid, SIGTERM);