/*
   ctdb recovery daemon

   Copyright (C) Ronnie Sahlberg 2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "includes.h"
#include "lib/tevent/tevent.h"
#include "system/filesys.h"
#include "system/time.h"
#include "system/network.h"
#include "system/wait.h"
#include "../include/ctdb_client.h"
#include "../include/ctdb_private.h"
#include "db_wrap.h"
#include "dlinklist.h"
/* list of "ctdb ipreallocate" processes to call back when we have
   finished the takeover run.
*/
struct ip_reallocate_list {
	struct ip_reallocate_list *next;
	struct rd_memdump_reply *rd;
};
struct ctdb_banning_state {
	uint32_t count;
	struct timeval last_reported_time;
};
/*
  private state of recovery daemon
*/
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;
	uint32_t num_connected;
	uint32_t last_culprit_node;
	struct ctdb_node_map *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;
	struct timed_event *send_election_te;
	struct timed_event *election_timeout;
	struct vacuum_info *vacuum_info;
	TALLOC_CTX *ip_reallocate_ctx;
	struct ip_reallocate_list *reallocate_callers;
	TALLOC_CTX *ip_check_disable_ctx;
	struct ctdb_control_get_ifaces *ifaces;
};
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
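/* Note (added commentary, not in the original source): both macros read
   live tunables, so they must be expanded where a "ctdb" variable is in
   scope, and every call produces a fresh absolute deadline: the current
   time plus RecoverTimeout or RecoverInterval seconds. */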
static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
/*
  ban a node for a period of time
*/
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_ban_time bantime;

	DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));

	if (!ctdb_validate_pnn(ctdb, pnn)) {
		DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
		return;
	}

	bantime.pnn  = pnn;
	bantime.time = ban_time;

	ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
		return;
	}
}
enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/*
  run the "recovered" eventscript on all nodes
*/
static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  remember the troublemaker
*/
static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
{
	struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
	struct ctdb_banning_state *ban_state;

	if (culprit >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
		return;
	}

	if (ctdb->nodes[culprit]->ban_state == NULL) {
		ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
		CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
	}

	ban_state = ctdb->nodes[culprit]->ban_state;
	if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
		/* this was the first time in a long while this node
		   misbehaved so we will forgive any old transgressions.
		*/
		ban_state->count = 0;
	}

	ban_state->count += count;
	ban_state->last_reported_time = timeval_current();
	rec->last_culprit_node = culprit;
}
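/* Added commentary (not in the original source): culprit tracking is a
   leaky credit counter.  Each misbehaviour adds credits, but a quiet spell
   longer than the RecoveryGracePeriod tunable wipes the old count before
   the new credits are added.  do_recovery() later bans a node once its
   count reaches 2 * num_nodes, e.g. 8 accumulated credits on a 4-node
   cluster, and resets the count when the ban is imposed. */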
/*
  remember the troublemaker
*/
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
/* this callback is called for every node that failed to execute the
   startrecovery event
*/
static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}
/*
  run the "startrecovery" eventscript on all nodes
*/
static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					NULL,
					startrecovery_fail_callback,
					rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
		DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
		return;
	}

	if (node_pnn < ctdb->num_nodes) {
		ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
	}

	if (node_pnn == ctdb->pnn) {
		ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
	}
}
/*
  update the node capabilities for all connected nodes
*/
static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
{
	uint32_t *nodes;
	TALLOC_CTX *tmp_ctx;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					async_getcap_callback, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}

static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}
/*
  change recovery mode on all nodes
*/
static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
{
	TDB_DATA data;
	uint32_t *nodes;
	TALLOC_CTX *tmp_ctx;
	int i;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	/* freeze all nodes */
	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (rec_mode == CTDB_RECOVERY_ACTIVE) {
		for (i=1; i<=NUM_DB_PRIORITIES; i++) {
			if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
						nodes, i,
						CONTROL_TIMEOUT(),
						false, tdb_null,
						NULL,
						set_recmode_fail_callback,
						rec) != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
				talloc_free(tmp_ctx);
				return -1;
			}
		}
	}

	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&rec_mode;

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  change the recovery master on all nodes
*/
static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
{
	TDB_DATA data;
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&pnn;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/* update all remote nodes to use the same db priority that we have.
   This can fail if the remote node has not yet been upgraded to
   support this function, so we always return success and never fail
   a recovery if this call fails.
*/
static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
					      struct ctdb_node_map *nodemap,
					      uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int db;
	uint32_t *nodes;

	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);

	/* step through all local databases */
	for (db=0; db<dbmap->num;db++) {
		TDB_DATA data;
		struct ctdb_db_priority db_prio;
		int ret;

		db_prio.db_id = dbmap->dbs[db].dbid;
		ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
			continue;
		}

		DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));

		data.dptr  = (uint8_t *)&db_prio;
		data.dsize = sizeof(db_prio);

		if (ctdb_client_async_control(ctdb,
					CTDB_CONTROL_SET_DB_PRIORITY,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
		}
	}

	return 0;
}
/*
  ensure all other nodes have attached to any databases that we have
*/
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					   uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
			return -1;
		}

		/* step through all local databases */
		for (db=0; db<dbmap->num;db++) {
			const char *name;

			for (i=0;i<remote_dbmap->num;i++) {
				if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
					break;
				}
			}
			/* the remote node already has this database */
			if (i!=remote_dbmap->num) {
				continue;
			}
			/* ok so we need to create this database */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
						  mem_ctx, &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
						 mem_ctx, name,
						 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
				return -1;
			}
		}
	}

	return 0;
}
/*
  ensure we are attached to any databases that anyone else is attached to
*/
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					  uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that we have all databases any other node has */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
			return -1;
		}

		/* step through all databases on the remote node */
		for (db=0; db<remote_dbmap->num;db++) {
			const char *name;

			for (i=0;i<(*dbmap)->num;i++) {
				if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
					break;
				}
			}
			/* we already have this db locally */
			if (i!=(*dbmap)->num) {
				continue;
			}
			/* ok so we need to create this database and
			   rebuild dbmap
			*/
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
						  remote_dbmap->dbs[db].dbid, mem_ctx, &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
					  nodemap->nodes[j].pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
						 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
				return -1;
			}
			ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
				return -1;
			}
		}
	}

	return 0;
}
/*
  pull the remote database contents from one node into the recdb
*/
static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
				    struct tdb_wrap *recdb, uint32_t dbid,
				    bool persistent)
{
	int ret;
	TDB_DATA outdata;
	struct ctdb_marshall_buffer *reply;
	struct ctdb_rec_data *rec;
	int i;
	TALLOC_CTX *tmp_ctx = talloc_new(recdb);

	ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
			       CONTROL_TIMEOUT(), &outdata);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
		talloc_free(tmp_ctx);
		return -1;
	}

	reply = (struct ctdb_marshall_buffer *)outdata.dptr;

	if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
		DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	rec = (struct ctdb_rec_data *)&reply->data[0];

	for (i=0;
	     i<reply->count;
	     rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
		TDB_DATA key, data;
		struct ctdb_ltdb_header *hdr;
		TDB_DATA existing;

		key.dptr = &rec->data[0];
		key.dsize = rec->keylen;
		data.dptr = &rec->data[key.dsize];
		data.dsize = rec->datalen;

		hdr = (struct ctdb_ltdb_header *)data.dptr;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}

		/* fetch the existing record, if any */
		existing = tdb_fetch(recdb->tdb, key);

		if (existing.dptr != NULL) {
			struct ctdb_ltdb_header header;
			if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
				DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
					 (unsigned)existing.dsize, srcnode));
				free(existing.dptr);
				talloc_free(tmp_ctx);
				return -1;
			}
			header = *(struct ctdb_ltdb_header *)existing.dptr;
			free(existing.dptr);
			if (!(header.rsn < hdr->rsn ||
			      (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
				continue;
			}
		}

		if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
			DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}
	}

	talloc_free(tmp_ctx);
	return 0;
}
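/* Added commentary (not in the original source): the merge rule above
   replaces the copy already in the recdb when the pulled record has a
   strictly higher rsn, or when the rsns are equal and the stored copy's
   dmaster is not the recovery master.  On an rsn tie this prefers the
   copy claiming the recovery master as dmaster; everything else is
   treated as a stale duplicate and skipped. */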
/*
  pull all the remote database contents into the recdb
*/
static int pull_remote_database(struct ctdb_context *ctdb,
				struct ctdb_recoverd *rec,
				struct ctdb_node_map *nodemap,
				struct tdb_wrap *recdb, uint32_t dbid,
				bool persistent)
{
	int j;

	/* pull all records from all other nodes across onto this node
	   (this merges based on rsn)
	*/
	for (j=0; j<nodemap->num; j++) {
		/* don't merge from nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
				 nodemap->nodes[j].pnn));
			ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
			return -1;
		}
	}

	return 0;
}


/*
  update flags on all active nodes
*/
static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
{
	int ret;

	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
		return -1;
	}

	return 0;
}
/*
  ensure all nodes have the same vnnmap we do
*/
static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
				      uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
{
	int j, ret;

	/* push the new vnn map out to all the nodes */
	for (j=0; j<nodemap->num; j++) {
		/* don't push to nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
			return -1;
		}
	}

	return 0;
}
struct vacuum_info {
	struct vacuum_info *next, *prev;
	struct ctdb_recoverd *rec;
	uint32_t srcnode;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_marshall_buffer *recs;
	struct ctdb_rec_data *r;
};

static void vacuum_fetch_next(struct vacuum_info *v);
/*
  called when a vacuum fetch has completed - just free it and do the next one
*/
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
	talloc_free(state);
	vacuum_fetch_next(v);
}
/*
  process the next element from the vacuum list
*/
static void vacuum_fetch_next(struct vacuum_info *v)
{
	struct ctdb_call call;
	struct ctdb_rec_data *r;

	while (v->recs->count) {
		struct ctdb_client_call_state *state;
		TDB_DATA data;
		struct ctdb_ltdb_header *hdr;

		ZERO_STRUCT(call);
		call.call_id = CTDB_NULL_FUNC;
		call.flags = CTDB_IMMEDIATE_MIGRATION;
		call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;

		r = v->r;
		v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
		v->recs->count--;

		call.key.dptr = &r->data[0];
		call.key.dsize = r->keylen;

		/* ensure we don't block this daemon - just skip a record if we can't get
		   the chainlock */
		if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
			continue;
		}

		data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
		if (data.dptr == NULL) {
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			continue;
		}

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			free(data.dptr);
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			continue;
		}

		hdr = (struct ctdb_ltdb_header *)data.dptr;
		if (hdr->dmaster == v->rec->ctdb->pnn) {
			/* it's already local */
			free(data.dptr);
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			continue;
		}

		free(data.dptr);

		state = ctdb_call_send(v->ctdb_db, &call);
		tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
		if (state == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
			talloc_free(v);
			return;
		}
		state->async.fn = vacuum_fetch_callback;
		state->async.private_data = v;
		return;
	}

	talloc_free(v);
}
/*
  destroy a vacuum info structure
*/
static int vacuum_info_destructor(struct vacuum_info *v)
{
	DLIST_REMOVE(v->rec->vacuum_info, v);
	return 0;
}
/*
  handler for vacuum fetch
*/
static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
				 TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_marshall_buffer *recs;
	int ret, i;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	const char *name;
	struct ctdb_dbid_map *dbmap=NULL;
	bool persistent = false;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_rec_data *r;
	uint32_t srcnode;
	struct vacuum_info *v;

	recs = (struct ctdb_marshall_buffer *)data.dptr;
	r = (struct ctdb_rec_data *)&recs->data[0];

	if (recs->count == 0) {
		talloc_free(tmp_ctx);
		return;
	}

	srcnode = r->reqid;

	for (v=rec->vacuum_info;v;v=v->next) {
		if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
			/* we're already working on records from this node */
			talloc_free(tmp_ctx);
			return;
		}
	}

	/* work out if the database is persistent */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
		talloc_free(tmp_ctx);
		return;
	}

	for (i=0;i<dbmap->num;i++) {
		if (dbmap->dbs[i].dbid == recs->db_id) {
			persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
			break;
		}
	}
	if (i == dbmap->num) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
		talloc_free(tmp_ctx);
		return;
	}

	/* find the name of this database */
	if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
		talloc_free(tmp_ctx);
		return;
	}

	ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
		talloc_free(tmp_ctx);
		return;
	}

	v = talloc_zero(rec, struct vacuum_info);
	if (v == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
		talloc_free(tmp_ctx);
		return;
	}

	v->rec = rec;
	v->srcnode = srcnode;
	v->ctdb_db = ctdb_db;
	v->recs = talloc_memdup(v, recs, data.dsize);
	if (v->recs == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
		talloc_free(v);
		talloc_free(tmp_ctx);
		return;
	}
	v->r = (struct ctdb_rec_data *)&v->recs->data[0];

	DLIST_ADD(rec->vacuum_info, v);

	talloc_set_destructor(v, vacuum_info_destructor);

	vacuum_fetch_next(v);
	talloc_free(tmp_ctx);
}
/*
  called when ctdb_wait_timeout should finish
*/
static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
			      struct timeval yt, void *p)
{
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}

/*
  wait for a given number of seconds
*/
static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
{
	uint32_t timed_out = 0;
	time_t usecs = (secs - (time_t)secs) * 1000000;
	event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
	while (!timed_out) {
		event_loop_once(ctdb->ev);
	}
}
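/* Added commentary (not in the original source): this is a synchronous
   sleep that still pumps the event loop, so messages and timers keep
   firing while the recovery daemon waits.  A plain sleep(3) here would
   stall all event handling for the duration. */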
/*
  called when an election times out (ends)
*/
static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
				  struct timeval t, void *p)
{
	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
	rec->election_timeout = NULL;

	DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
}


/*
  wait for an election to finish. It finishes election_timeout seconds after
  the last election packet is received
*/
static void ctdb_wait_election(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	while (rec->election_timeout) {
		event_loop_once(ctdb->ev);
	}
}
/*
  Update our local flags from all remote connected nodes.
  This is only run when we are or we believe we are the recovery master
*/
static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
{
	int j;
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);

	/* get the nodemap for all active remote nodes and verify
	   they are the same as for this node
	*/
	for (j=0; j<nodemap->num; j++) {
		struct ctdb_node_map *remote_nodemap=NULL;
		int ret;

		if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}
		if (nodemap->nodes[j].pnn == ctdb->pnn) {
			continue;
		}

		ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					   mem_ctx, &remote_nodemap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}
		if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
			/* We should tell our daemon about this so it
			   updates its flags or else we will log the same
			   message again in the next iteration of recovery.
			   Since we are the recovery master we can just as
			   well update the flags on all nodes.
			*/
			ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
				talloc_free(mem_ctx);
				return -1;
			}

			/* Update our local copy of the flags in the recovery
			   daemon.
			*/
			DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
				 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
				 nodemap->nodes[j].flags));
			nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
		}
		talloc_free(remote_nodemap);
	}
	talloc_free(mem_ctx);
	return MONITOR_OK;
}
/* Create a new random generation id.
   The generation id cannot be the INVALID_GENERATION id
*/
static uint32_t new_generation(void)
{
	uint32_t generation;

	while (1) {
		generation = random();

		if (generation != INVALID_GENERATION) {
			break;
		}
	}

	return generation;
}
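/* Added commentary (not in the original source): the generation id tags
   the vnnmap and cluster-wide transactions with the recovery instance
   that produced them, so traffic from before a recovery can be recognised
   as stale.  Rejecting INVALID_GENERATION keeps that sentinel value free
   to mean "no valid generation". */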
/*
  create a temporary working database
*/
static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
{
	char *name;
	struct tdb_wrap *recdb;
	unsigned tdb_flags;

	/* open up the temporary recovery database */
	name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
			       ctdb->db_directory_state,
			       ctdb->pnn);
	if (name == NULL) {
		return NULL;
	}
	unlink(name);

	tdb_flags = TDB_NOLOCK;
	if (ctdb->valgrinding) {
		tdb_flags |= TDB_NOMMAP;
	}
	tdb_flags |= TDB_DISALLOW_NESTING;

	recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
			      tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
	if (recdb == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
	}

	talloc_free(name);
	return recdb;
}
/*
  a traverse function for pulling all relevant records from recdb
*/
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata;
	uint32_t len;
	bool failed;
	bool persistent;
};

static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
{
	struct recdb_data *params = (struct recdb_data *)p;
	struct ctdb_rec_data *rec;
	struct ctdb_ltdb_header *hdr;

	/* skip empty records */
	if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
		return 0;
	}

	/* update the dmaster field to point to us */
	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (!params->persistent) {
		hdr->dmaster = params->ctdb->pnn;
		hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
	}

	/* add the record to the blob ready to send to the nodes */
	rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
	if (rec == NULL) {
		params->failed = true;
		return -1;
	}
	params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
	if (params->recdata == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
			 rec->length + params->len));
		params->failed = true;
		return -1;
	}
	params->recdata->count++;
	memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
	params->len += rec->length;
	talloc_free(rec);

	return 0;
}
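/* Added commentary (not in the original source): the traverse builds one
   contiguous ctdb_marshall_buffer in memory.  params->len starts at
   offsetof(struct ctdb_marshall_buffer, data) (see push_recdb_database
   below), each marshalled record is memcpy'd to that offset and the
   offset advances by rec->length, so the blob ends up as a header plus
   records packed back to back, with count tracking how many were added. */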
/*
  push the recdb database out to all nodes
*/
static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
			       bool persistent,
			       struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
{
	struct recdb_data params;
	struct ctdb_marshall_buffer *recdata;
	TDB_DATA outdata;
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
	CTDB_NO_MEMORY(ctdb, recdata);

	recdata->db_id = dbid;

	params.ctdb = ctdb;
	params.recdata = recdata;
	params.len = offsetof(struct ctdb_marshall_buffer, data);
	params.failed = false;
	params.persistent = persistent;

	if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	if (params.failed) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = params.recdata;

	outdata.dptr = (void *)recdata;
	outdata.dsize = params.len;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
					nodes, 0,
					CONTROL_TIMEOUT(), false, outdata,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
		talloc_free(recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
		  dbid, recdata->count));

	talloc_free(recdata);
	talloc_free(tmp_ctx);
	return 0;
}
/*
  go through a full recovery on one database
*/
static int recover_database(struct ctdb_recoverd *rec,
			    TALLOC_CTX *mem_ctx,
			    uint32_t dbid,
			    bool persistent,
			    uint32_t pnn,
			    struct ctdb_node_map *nodemap,
			    uint32_t transaction_id)
{
	struct tdb_wrap *recdb;
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	TDB_DATA data;
	struct ctdb_control_wipe_database w;
	uint32_t *nodes;

	recdb = create_recdb(ctdb, mem_ctx);
	if (recdb == NULL) {
		return -1;
	}

	/* pull all remote databases onto the recdb */
	ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));

	/* wipe all the remote databases. This is safe as we are in a transaction */
	w.db_id = dbid;
	w.transaction_id = transaction_id;

	data.dptr = (void *)&w;
	data.dsize = sizeof(w);

	nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
		talloc_free(recdb);
		return -1;
	}

	/* push out the correct database. This sets the dmaster and skips
	   the empty records */
	ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
	if (ret != 0) {
		talloc_free(recdb);
		return -1;
	}

	/* all done with this database */
	talloc_free(recdb);

	return 0;
}
/*
  reload the nodes file
*/
static void reload_nodes_file(struct ctdb_context *ctdb)
{
	ctdb->nodes = NULL;
	ctdb_load_nodes_file(ctdb);
}
static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
					 struct ctdb_recoverd *rec,
					 struct ctdb_node_map *nodemap,
					 uint32_t *culprit)
{
	int j;
	int ret;

	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
			  ctdb->num_nodes, nodemap->num));
		if (culprit) {
			*culprit = ctdb->pnn;
		}
		return -1;
	}

	for (j=0; j<nodemap->num; j++) {
		/* release any existing data */
		if (ctdb->nodes[j]->known_public_ips) {
			talloc_free(ctdb->nodes[j]->known_public_ips);
			ctdb->nodes[j]->known_public_ips = NULL;
		}
		if (ctdb->nodes[j]->available_public_ips) {
			talloc_free(ctdb->nodes[j]->available_public_ips);
			ctdb->nodes[j]->available_public_ips = NULL;
		}

		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* grab a new shiny list of public ips from the node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
					CONTROL_TIMEOUT(),
					ctdb->nodes[j]->pnn,
					ctdb->nodes[j],
					0,
					&ctdb->nodes[j]->known_public_ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
				ctdb->nodes[j]->pnn));
			if (culprit) {
				*culprit = ctdb->nodes[j]->pnn;
			}
			return -1;
		}

		if (ctdb->tunable.disable_ip_failover == 0) {
			if (rec->ip_check_disable_ctx == NULL) {
				if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
					DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
					rec->need_takeover_run = true;
				}
			}
		}

		/* grab a new shiny list of available public ips from the node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
					CONTROL_TIMEOUT(),
					ctdb->nodes[j]->pnn,
					ctdb->nodes[j],
					CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
					&ctdb->nodes[j]->available_public_ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
				ctdb->nodes[j]->pnn));
			if (culprit) {
				*culprit = ctdb->nodes[j]->pnn;
			}
			return -1;
		}
	}

	return 0;
}
/* when we start a recovery, make sure all nodes use the same reclock file
   setting
*/
static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	TDB_DATA data;
	uint32_t *nodes;

	if (ctdb->recovery_lock_file == NULL) {
		ZERO_STRUCT(data);
	} else {
		data.dsize = strlen(ctdb->recovery_lock_file) + 1;
		data.dptr = (uint8_t *)ctdb->recovery_lock_file;
	}

	nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
					nodes, 0,
					CONTROL_TIMEOUT(),
					false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  we are the recmaster, and recovery is needed - start a recovery run
*/
static int do_recovery(struct ctdb_recoverd *rec,
		       TALLOC_CTX *mem_ctx, uint32_t pnn,
		       struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i, j, ret;
	uint32_t generation;
	struct ctdb_dbid_map *dbmap;
	TDB_DATA data;
	uint32_t *nodes;
	struct timeval start_time;
	uint32_t culprit = (uint32_t)-1;

	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));

	/* if recovery fails, force it again */
	rec->need_recovery = true;

	for (i=0; i<ctdb->num_nodes; i++) {
		struct ctdb_banning_state *ban_state;

		if (ctdb->nodes[i]->ban_state == NULL) {
			continue;
		}
		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		if (ban_state->count < 2*ctdb->num_nodes) {
			continue;
		}
		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
			ctdb->nodes[i]->pnn, ban_state->count,
			ctdb->tunable.recovery_ban_period));
		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
		ban_state->count = 0;
	}

	if (ctdb->tunable.verify_recovery_lock != 0) {
		DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
		start_time = timeval_current();
		if (!ctdb_recovery_lock(ctdb, true)) {
			DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
					 "and ban ourself for %u seconds\n",
					 ctdb->tunable.recovery_ban_period));
			ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
			return -1;
		}
		ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
		DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));

	/* get a list of all databases */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
		return -1;
	}

	/* we do the db creation before we set the recovery mode, so the freeze happens
	   on all databases we will be dealing with. */

	/* verify that we have all the databases any other node has */
	ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
		return -1;
	}

	/* verify that all other nodes have all our databases */
	ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
		return -1;
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));

	/* update the database priority for all remote databases */
	ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));

	/* update all other nodes to use the same setting for reclock files
	   as the local recovery master.
	*/
	sync_recovery_lock_file_across_cluster(rec);

	/* set recovery mode to active on all nodes */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
		return -1;
	}

	/* execute the "startrecovery" event script on all nodes */
	ret = run_startrecovery_eventscript(rec, nodemap);
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
		return -1;
	}

	/*
	  update all nodes to have the same flags that we have
	*/
	for (i=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
			return -1;
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));

	/* pick a new generation number */
	generation = new_generation();

	/* change the vnnmap on this node to use the new generation
	   number but not on any other nodes.
	   this guarantees that if we abort the recovery prematurely
	   for some reason (a node stops responding?)
	   that we can just return immediately and we will reenter
	   recovery shortly again.
	   I.e. we deliberately leave the cluster with an inconsistent
	   generation id to allow us to abort recovery at any stage and
	   just restart it from scratch.
	*/
	vnnmap->generation = generation;
	ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
		return -1;
	}

	data.dptr = (void *)&generation;
	data.dsize = sizeof(uint32_t);

	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL,
					transaction_start_fail_callback,
					rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
		if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					NULL, NULL,
					NULL) != 0) {
			DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
		}
		return -1;
	}

	DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));

	for (i=0;i<dbmap->num;i++) {
		ret = recover_database(rec, mem_ctx,
				       dbmap->dbs[i].dbid,
				       dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
				       pnn, nodemap, generation);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
			return -1;
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));

	/* commit all the changes */
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));

	/* update the capabilities for all nodes */
	ret = update_capabilities(ctdb, nodemap);
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
		return -1;
	}

	/* build a new vnn map with all the currently active and
	   unbanned nodes */
	generation = new_generation();
	vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
	CTDB_NO_MEMORY(ctdb, vnnmap);
	vnnmap->generation = generation;
	vnnmap->size = 0;
	vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
	CTDB_NO_MEMORY(ctdb, vnnmap->map);
	for (i=j=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
			/* this node cannot be an lmaster */
			DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
			continue;
		}

		vnnmap->size++;
		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
		CTDB_NO_MEMORY(ctdb, vnnmap->map);
		vnnmap->map[j++] = nodemap->nodes[i].pnn;
	}
	if (vnnmap->size == 0) {
		DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
		vnnmap->size++;
		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
		CTDB_NO_MEMORY(ctdb, vnnmap->map);
		vnnmap->map[0] = pnn;
	}
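	/* Added commentary (not in the original source): the vnnmap is the
	   hash ring that assigns a "location master" per record, essentially
	   lmaster = vnnmap->map[hash(key) % vnnmap->size].  Rebuilding it
	   here from active nodes holding CTDB_CAP_LMASTER means banned,
	   stopped and lmaster-incapable nodes stop owning record locations
	   once the new map is pushed out. */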
	/* update to the new vnnmap on all nodes */
	ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));

	/* update recmaster to point to us for all nodes */
	ret = set_recovery_master(ctdb, nodemap, pnn);
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));

	/*
	  update all nodes to have the same flags that we have
	*/
	for (i=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
			return -1;
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));

	/* disable recovery mode */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));

	/*
	  tell nodes to take over their public IPs
	*/
	ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
			 culprit));
		rec->need_takeover_run = true;
		return -1;
	}
	rec->need_takeover_run = false;
	ret = ctdb_takeover_run(ctdb, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
		rec->need_takeover_run = true;
	}

	/* execute the "recovered" event script on all nodes */
	ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));

	/* send a message to all clients telling them that the cluster
	   has been reconfigured */
	ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));

	rec->need_recovery = false;

	/* we managed to complete a full recovery, make sure to forgive
	   any past sins by the nodes that could now participate in the
	   recovery.
	*/
	DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
	for (i=0;i<nodemap->num;i++) {
		struct ctdb_banning_state *ban_state;

		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
		if (ban_state == NULL) {
			continue;
		}

		ban_state->count = 0;
	}

	/* We just finished a recovery successfully.
	   We now wait for rerecovery_timeout before we allow
	   another recovery to take place.
	*/
	DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
	ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
	DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));

	return 0;
}
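/* Added summary (not in the original source): do_recovery() above runs,
   in broad strokes:
     1. ban repeat culprits and take the recovery lock
     2. create any missing local and remote databases
     3. freeze the cluster (recovery mode ACTIVE), run "startrecovery"
     4. start a cluster-wide transaction tagged with a fresh generation id
     5. per database: pull all copies, merge by rsn, wipe, push the merge
     6. commit, rebuild and distribute the vnnmap, re-point the recmaster
     7. thaw (recovery mode NORMAL), rebalance public IPs, run "recovered"
   Any failure returns early; need_recovery stays true so the main loop
   retries the whole sequence. */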
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
*/
struct election_message {
	uint32_t num_connected;
	struct timeval priority_time;
	uint32_t pnn;
	uint32_t node_flags;
};
/*
  form this node's election data
*/
static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
{
	int ret, i;
	struct ctdb_node_map *nodemap;
	struct ctdb_context *ctdb = rec->ctdb;

	ZERO_STRUCTP(em);

	em->pnn = rec->ctdb->pnn;
	em->priority_time = rec->priority_time;

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
		return;
	}

	rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
	em->node_flags = rec->node_flags;

	for (i=0;i<nodemap->num;i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
			em->num_connected++;
		}
	}

	/* we shouldn't try to win this election if we can't be a recmaster */
	if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
		em->num_connected = 0;
		em->priority_time = timeval_current();
	}

	talloc_free(nodemap);
}
/*
  see if the given election data wins
*/
static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
{
	struct election_message myem;
	int cmp = 0;

	ctdb_election_data(rec, &myem);

	/* we can't win if we don't have the recmaster capability */
	if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
		return false;
	}

	/* we can't win if we are banned */
	if (rec->node_flags & NODE_FLAGS_BANNED) {
		return false;
	}

	/* we can't win if we are stopped */
	if (rec->node_flags & NODE_FLAGS_STOPPED) {
		return false;
	}

	/* we will automatically win if the other node is banned */
	if (em->node_flags & NODE_FLAGS_BANNED) {
		return true;
	}

	/* we will automatically win if the other node is stopped */
	if (em->node_flags & NODE_FLAGS_STOPPED) {
		return true;
	}

	/* try to use the most connected node */
	if (cmp == 0) {
		cmp = (int)myem.num_connected - (int)em->num_connected;
	}

	/* then the longest running node */
	if (cmp == 0) {
		cmp = timeval_compare(&em->priority_time, &myem.priority_time);
	}

	if (cmp == 0) {
		cmp = (int)myem.pnn - (int)em->pnn;
	}

	return cmp > 0;
}
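/* Illustrative sketch (added, not in the original source): ignoring the
   banned/stopped short-circuits, the win test above reduces to a
   three-key lexicographic comparison, along the lines of:

	static int election_cmp(const struct election_message *a,
				const struct election_message *b)
	{
		if (a->num_connected != b->num_connected) {
			// the better connected node wins
			return (int)a->num_connected - (int)b->num_connected;
		}
		if (timeval_compare(&b->priority_time, &a->priority_time) != 0) {
			// then the longest running node (earliest start) wins
			return timeval_compare(&b->priority_time, &a->priority_time);
		}
		// final tie-break: the higher pnn wins
		return (int)a->pnn - (int)b->pnn;
	}

   where a positive result means "a" beats "b". */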
/*
  send out an election request
*/
static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
{
	int ret;
	TDB_DATA election_data;
	struct election_message emsg;
	uint64_t srvid;
	struct ctdb_context *ctdb = rec->ctdb;

	srvid = CTDB_SRVID_RECOVERY;

	ctdb_election_data(rec, &emsg);

	election_data.dsize = sizeof(struct election_message);
	election_data.dptr  = (unsigned char *)&emsg;


	/* send an election message to all active nodes */
	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
	ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);


	/* A new node that is already frozen has entered the cluster.
	   The existing nodes are not frozen and don't need to be frozen
	   until the election has ended and we start the actual recovery
	*/
	if (update_recmaster == true) {
		/* first we assume we will win the election and set
		   recoverymaster to be ourself on the current node
		*/
		ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
			return -1;
		}
	}

	return 0;
}
/*
  this function will unban all nodes in the cluster
*/
static void unban_all_nodes(struct ctdb_context *ctdb)
{
	int ret, i;
	struct ctdb_node_map *nodemap;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
		talloc_free(tmp_ctx);
		return;
	}

	for (i=0;i<nodemap->num;i++) {
		if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
		  && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
			ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
		}
	}

	talloc_free(tmp_ctx);
}
/*
  we think we are winning the election - send a broadcast election request
*/
static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
{
	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
	int ret;

	ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
	}

	talloc_free(rec->send_election_te);
	rec->send_election_te = NULL;
}
/*
  handler for memory dumps
*/
static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
			     TDB_DATA data, void *private_data)
{
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	TDB_DATA *dump;
	int ret;
	struct rd_memdump_reply *rd;

	if (data.dsize != sizeof(struct rd_memdump_reply)) {
		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
		talloc_free(tmp_ctx);
		return;
	}
	rd = (struct rd_memdump_reply *)data.dptr;

	dump = talloc_zero(tmp_ctx, TDB_DATA);
	if (dump == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
		talloc_free(tmp_ctx);
		return;
	}
	ret = ctdb_dump_memory(ctdb, dump);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
		talloc_free(tmp_ctx);
		return;
	}

	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));

	ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
		talloc_free(tmp_ctx);
		return;
	}

	talloc_free(tmp_ctx);
}
/*
  handler for reload_nodes
*/
static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
				 TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));

	reload_nodes_file(rec->ctdb);
}


static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
			      struct timeval yt, void *p)
{
	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);

	talloc_free(rec->ip_check_disable_ctx);
	rec->ip_check_disable_ctx = NULL;
}
static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
				   TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_public_ip *ip;

	if (rec->recmaster != rec->ctdb->pnn) {
		DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
		return;
	}

	if (data.dsize != sizeof(struct ctdb_public_ip)) {
		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
		return;
	}

	ip = (struct ctdb_public_ip *)data.dptr;

	update_ip_assignment_tree(rec->ctdb, ip);
}
static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
				     TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	uint32_t timeout;

	if (rec->ip_check_disable_ctx != NULL) {
		talloc_free(rec->ip_check_disable_ctx);
		rec->ip_check_disable_ctx = NULL;
	}

	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data: %lu "
				 "expecting %lu\n", (long unsigned)data.dsize,
				 (long unsigned)sizeof(uint32_t)));
		return;
	}
	if (data.dptr == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
		return;
	}

	timeout = *((uint32_t *)data.dptr);
	DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));

	rec->ip_check_disable_ctx = talloc_new(rec);
	CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);

	event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
}
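/* Added commentary (not in the original source): ip_check_disable_ctx
   doubles as a cancellable timer handle.  The re-enable event is allocated
   on that context, so freeing the context (on a repeat request above, or
   in reenable_ip_check itself) cancels any pending timer, and a NULL
   pointer is the single source of truth for "IP checking is enabled". */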
/*
  handler for ip reallocate, just add it to the list of callers and
  handle this later in the monitor_cluster loop so we do not recurse
  with other callers to takeover_run()
*/
static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
				  TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ip_reallocate_list *caller;

	if (data.dsize != sizeof(struct rd_memdump_reply)) {
		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
		return;
	}

	if (rec->ip_reallocate_ctx == NULL) {
		rec->ip_reallocate_ctx = talloc_new(rec);
		CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
	}

	caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
	CTDB_NO_MEMORY_FATAL(ctdb, caller);

	caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
	caller->next = rec->reallocate_callers;
	rec->reallocate_callers = caller;
}
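/* Added commentary (not in the original source): this is a deferred-work
   queue.  The handler only records who asked; process_ipreallocate_requests()
   below runs one takeover pass from the main monitoring loop and then
   answers every queued caller.  That both avoids re-entering
   ctdb_takeover_run() and coalesces a burst of "ctdb ipreallocate"
   requests into a single run. */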
static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
{
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	TDB_DATA result;
	int32_t ret;
	struct ip_reallocate_list *callers;
	uint32_t culprit;

	DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));

	/* update the list of public ips that a node can handle for
	   all connected nodes
	*/
	ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
			 culprit));
		rec->need_takeover_run = true;
	}
	if (ret == 0) {
		ret = ctdb_takeover_run(ctdb, rec->nodemap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
			rec->need_takeover_run = true;
		}
	}

	result.dsize = sizeof(int32_t);
	result.dptr  = (uint8_t *)&ret;

	for (callers=rec->reallocate_callers; callers; callers=callers->next) {

		/* Someone that sent srvid==0 does not want a reply */
		if (callers->rd->srvid == 0) {
			continue;
		}
		DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
				  "%u:%llu\n", (unsigned)callers->rd->pnn,
				  (unsigned long long)callers->rd->srvid));
		ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
					 "message to %u:%llu\n",
					 (unsigned)callers->rd->pnn,
					 (unsigned long long)callers->rd->srvid));
		}
	}

	talloc_free(tmp_ctx);
	talloc_free(rec->ip_reallocate_ctx);
	rec->ip_reallocate_ctx = NULL;
	rec->reallocate_callers = NULL;
}
/*
  handler for recovery master elections
*/
static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
			     TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	int ret;
	struct election_message *em = (struct election_message *)data.dptr;
	TALLOC_CTX *mem_ctx;

	/* we got an election packet - update the timeout for the election */
	talloc_free(rec->election_timeout);
	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
						fast_start ?
						timeval_current_ofs(0, 500000) :
						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
						ctdb_election_timeout, rec);

	mem_ctx = talloc_new(ctdb);

	/* someone called an election. check their election data
	   and if we disagree and we would rather be the elected node,
	   send a new election message to all other nodes
	*/
	if (ctdb_election_win(rec, em)) {
		if (!rec->send_election_te) {
			rec->send_election_te = event_add_timed(ctdb->ev, rec,
								timeval_current_ofs(0, 500000),
								election_send_request, rec);
		}
		talloc_free(mem_ctx);
		/*unban_all_nodes(ctdb);*/
		return;
	}

	/* we did not win */
	talloc_free(rec->send_election_te);
	rec->send_election_te = NULL;

	if (ctdb->tunable.verify_recovery_lock != 0) {
		/* release the recmaster lock */
		if (em->pnn != ctdb->pnn &&
		    ctdb->recovery_lock_fd != -1) {
			close(ctdb->recovery_lock_fd);
			ctdb->recovery_lock_fd = -1;
			unban_all_nodes(ctdb);
		}
	}

	/* ok, let that guy become recmaster then */
	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
		talloc_free(mem_ctx);
		return;
	}

	talloc_free(mem_ctx);
}
/*
  force the start of the election process
*/
static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
			   struct ctdb_node_map *nodemap)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;

	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));

	/* set all nodes to recovery mode to stop all internode traffic */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
		return;
	}

	talloc_free(rec->election_timeout);
	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
						fast_start ?
						timeval_current_ofs(0, 500000) :
						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
						ctdb_election_timeout, rec);

	ret = send_election_request(rec, pnn, true);
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
		return;
	}

	/* wait for a few seconds to collect all responses */
	ctdb_wait_election(rec);
}
/*
  handler for when a node changes its flags
*/
static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
			    TDB_DATA data, void *private_data)
{
	int ret;
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map *nodemap=NULL;
	TALLOC_CTX *tmp_ctx;
	int i;
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	int disabled_flag_changed;

	if (data.dsize != sizeof(*c)) {
		DEBUG(DEBUG_ERR,(__location__ " Invalid data in ctdb_node_flag_change\n"));
		return;
	}

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getnodemap failed in monitor_handler\n"));
		talloc_free(tmp_ctx);
		return;
	}

	for (i=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].pnn == c->pnn) break;
	}

	if (i == nodemap->num) {
		DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}

	if (nodemap->nodes[i].flags != c->new_flags) {
		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
	}

	disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;

	nodemap->nodes[i].flags = c->new_flags;

	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, &ctdb->recovery_master);

	if (ret == 0) {
		ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
					   CTDB_CURRENT_NODE, &ctdb->recovery_mode);
	}

	if (ret == 0 &&
	    ctdb->recovery_master == ctdb->pnn &&
	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
		/* Only do the takeover run if the perm disabled or unhealthy
		   flags changed since these will cause an ip failover but not
		   a recovery.
		   If the node became disconnected or banned this will also
		   lead to an ip address failover but that is handled
		   by the recovery.
		*/
		if (disabled_flag_changed) {
			rec->need_takeover_run = true;
		}
	}

	talloc_free(tmp_ctx);
}
/*
  handler for when we need to push out flag changes to all other nodes
*/
static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
			       TDB_DATA data, void *private_data)
{
	int ret;
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map *nodemap=NULL;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	uint32_t recmaster;
	uint32_t *nodes;

	/* find the recovery master */
	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
		talloc_free(tmp_ctx);
		return;
	}

	/* read the node flags from the recmaster */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}
	if (c->pnn >= nodemap->num) {
		DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}

	/* send the flags update to all connected nodes */
	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
				      nodes, 0, CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
		talloc_free(tmp_ctx);
		return;
	}

	talloc_free(tmp_ctx);
}
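/* Note that the recmaster's nodemap is treated as authoritative here: we only
 * use it to validate the pnn and to pick the recipients, and then forward the
 * original flag-change blob ("data") unmodified to every connected node via
 * CTDB_CONTROL_MODIFY_FLAGS.
 */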
struct verify_recmode_normal_data {
	uint32_t count;
	enum monitor_result status;
};

static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
{
	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);

	/* one more node has responded with recmode data */
	rmdata->count--;

	/* if we failed to get the recmode, then return an error and let
	   the main loop try again.
	*/
	if (state->state != CTDB_CONTROL_DONE) {
		if (rmdata->status == MONITOR_OK) {
			rmdata->status = MONITOR_FAILED;
		}
		return;
	}

	/* if we got a response, then the recmode will be stored in the
	   status field
	*/
	if (state->status != CTDB_RECOVERY_NORMAL) {
		DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
		rmdata->status = MONITOR_RECOVERY_NEEDED;
	}

	return;
}

/* verify that all nodes are in normal recovery mode */
static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
{
	struct verify_recmode_normal_data *rmdata;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_client_control_state *state;
	enum monitor_result status;
	int j;

	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
	rmdata->count  = 0;
	rmdata->status = MONITOR_OK;

	/* loop over all active nodes and send an async getrecmode call to
	   them */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
					CONTROL_TIMEOUT(),
					nodemap->nodes[j].pnn);
		if (state == NULL) {
			/* we failed to send the control, treat this as
			   an error and try again next iteration
			*/
			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}

		/* set up the callback functions */
		state->async.fn = verify_recmode_normal_callback;
		state->async.private_data = rmdata;

		/* one more control to wait for to complete */
		rmdata->count++;
	}

	/* now wait for up to the maximum number of seconds allowed
	   or until all nodes we expect a response from have replied
	*/
	while (rmdata->count > 0) {
		event_loop_once(ctdb->ev);
	}

	status = rmdata->status;
	talloc_free(mem_ctx);
	return status;
}
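/* Both verify_recmode() above and verify_recmaster() below use the same
 * scatter/gather idiom: send one async control per active node, count the
 * outstanding replies, and pump event_loop_once() until the counter reaches
 * zero. Each control carries its own timeout, so the loop terminates even if
 * a node never answers - the callback just records MONITOR_FAILED.
 */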
struct verify_recmaster_data {
	struct ctdb_recoverd *rec;
	uint32_t count;
	uint32_t pnn;
	enum monitor_result status;
};

static void verify_recmaster_callback(struct ctdb_client_control_state *state)
{
	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);

	/* one more node has responded with recmaster data */
	rmdata->count--;

	/* if we failed to get the recmaster, then return an error and let
	   the main loop try again.
	*/
	if (state->state != CTDB_CONTROL_DONE) {
		if (rmdata->status == MONITOR_OK) {
			rmdata->status = MONITOR_FAILED;
		}
		return;
	}

	/* if we got a response, then the recmaster will be stored in the
	   status field
	*/
	if (state->status != rmdata->pnn) {
		DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
		rmdata->status = MONITOR_ELECTION_NEEDED;
	}

	return;
}

/* verify that all nodes agree that we are the recmaster */
static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
{
	struct ctdb_context *ctdb = rec->ctdb;
	struct verify_recmaster_data *rmdata;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_client_control_state *state;
	enum monitor_result status;
	int j;

	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
	rmdata->rec    = rec;
	rmdata->count  = 0;
	rmdata->pnn    = pnn;
	rmdata->status = MONITOR_OK;

	/* loop over all active nodes and send an async getrecmaster call to
	   them */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
					CONTROL_TIMEOUT(),
					nodemap->nodes[j].pnn);
		if (state == NULL) {
			/* we failed to send the control, treat this as
			   an error and try again next iteration
			*/
			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}

		/* set up the callback functions */
		state->async.fn = verify_recmaster_callback;
		state->async.private_data = rmdata;

		/* one more control to wait for to complete */
		rmdata->count++;
	}

	/* now wait for up to the maximum number of seconds allowed
	   or until all nodes we expect a response from have replied
	*/
	while (rmdata->count > 0) {
		event_loop_once(ctdb->ev);
	}

	status = rmdata->status;
	talloc_free(mem_ctx);
	return status;
}
/* called to check that the local allocation of public ip addresses is ok.
*/
static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
{
	TALLOC_CTX *mem_ctx = talloc_new(NULL);
	struct ctdb_control_get_ifaces *ifaces = NULL;
	struct ctdb_all_public_ips *ips = NULL;
	struct ctdb_uptime *uptime1 = NULL;
	struct ctdb_uptime *uptime2 = NULL;
	int ret, j;
	bool need_iface_check = false;
	bool need_takeover_run = false;

	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
				CTDB_CURRENT_NODE, &uptime1);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
		talloc_free(mem_ctx);
		return -1;
	}

	/* read the interfaces from the local node */
	ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
		talloc_free(mem_ctx);
		return -1;
	}

	if (!rec->ifaces) {
		need_iface_check = true;
	} else if (rec->ifaces->num != ifaces->num) {
		need_iface_check = true;
	} else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
		need_iface_check = true;
	}

	if (need_iface_check) {
		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
				     "local node %u - force takeover run\n",
				     pnn));
		need_takeover_run = true;
	}

	/* read the ip allocation from the local node */
	ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
		talloc_free(mem_ctx);
		return -1;
	}

	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
				CTDB_CURRENT_NODE, &uptime2);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
		talloc_free(mem_ctx);
		return -1;
	}

	/* skip the check if the startrecovery time has changed */
	if (timeval_compare(&uptime1->last_recovery_started,
			    &uptime2->last_recovery_started) != 0) {
		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
		talloc_free(mem_ctx);
		return 0;
	}

	/* skip the check if the endrecovery time has changed */
	if (timeval_compare(&uptime1->last_recovery_finished,
			    &uptime2->last_recovery_finished) != 0) {
		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
		talloc_free(mem_ctx);
		return 0;
	}

	/* skip the check if we have started but not finished recovery */
	if (timeval_compare(&uptime1->last_recovery_finished,
			    &uptime1->last_recovery_started) != 1) {
		DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
		talloc_free(mem_ctx);
		return 0;
	}

	talloc_free(rec->ifaces);
	rec->ifaces = talloc_steal(rec, ifaces);

	/* verify that we have the ip addresses we should have and we don't
	   have ones we shouldn't have.
	   if we find an inconsistency we set recmode to active on the local
	   node and wait for the recmaster to do a full blown recovery.
	   also if the pnn is -1 and we are healthy and can host the ip
	   we also request an ip reallocation.
	*/
	if (ctdb->tunable.disable_ip_failover == 0) {
		for (j=0; j<ips->num; j++) {
			if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
				DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
						ctdb_addr_to_str(&ips->ips[j].addr)));
				need_takeover_run = true;
			} else if (ips->ips[j].pnn == pnn) {
				if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
					DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
						ctdb_addr_to_str(&ips->ips[j].addr)));
					need_takeover_run = true;
				}
			} else {
				if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
					DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
						ctdb_addr_to_str(&ips->ips[j].addr)));
					need_takeover_run = true;
				}
			}
		}
	}

	if (need_takeover_run) {
		struct takeover_run_reply rd;
		TDB_DATA data;

		DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));

		rd.pnn = ctdb->pnn;
		rd.srvid = 0;
		data.dptr = (uint8_t *)&rd;
		data.dsize = sizeof(rd);

		ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
		}
	}
	talloc_free(mem_ctx);
	return 0;
}
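/* The two uptime reads bracketing the interface and ip queries above act as
 * a cheap optimistic-concurrency check: if a recovery started or finished
 * while we were sampling, the sample cannot be trusted, so the check is
 * skipped and retried on the next pass of the monitoring loop.
 */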
static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_node_map **remote_nodemaps = callback_data;

	if (node_pnn >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
		return;
	}

	remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
}

static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
			       struct ctdb_node_map *nodemap,
			       struct ctdb_node_map **remote_nodemaps)
{
	uint32_t *nodes;

	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					async_getnodemap_callback,
					NULL,
					remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
		return -1;
	}

	return 0;
}
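/* remote_nodemaps[] is indexed by pnn; async_getnodemap_callback() fills in
 * the slots as replies arrive, so entries for nodes that never answered stay
 * NULL and are caught by the caller in main_loop().
 */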
enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT };

struct ctdb_check_reclock_state {
	struct ctdb_context *ctdb;
	struct timeval start_time;
	int fd[2];
	pid_t child;
	struct timed_event *te;
	struct fd_event *fde;
	enum reclock_child_status status;
};

/* when we free the reclock state we must kill any child process.
*/
static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
{
	struct ctdb_context *ctdb = state->ctdb;

	ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));

	if (state->fd[0] != -1) {
		close(state->fd[0]);
		state->fd[0] = -1;
	}
	if (state->fd[1] != -1) {
		close(state->fd[1]);
		state->fd[1] = -1;
	}
	kill(state->child, SIGKILL);
	return 0;
}

/*
  called if our check_reclock child times out. this would happen if
  i/o to the reclock file blocks.
 */
static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
				       struct timeval t, void *private_data)
{
	struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
					struct ctdb_check_reclock_state);

	DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timed out. CFS slow to grant locks?\n"));
	state->status = RECLOCK_TIMEOUT;
}

/* this is called when the child process has completed checking the reclock
   file and has written data back to us through the pipe.
*/
static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
				  uint16_t flags, void *private_data)
{
	struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
					struct ctdb_check_reclock_state);
	char c = 0;
	int ret;

	/* we got a response from our child process so we can abort the
	   timeout.
	*/
	talloc_free(state->te);
	state->te = NULL;

	ret = read(state->fd[0], &c, 1);
	if (ret != 1 || c != RECLOCK_OK) {
		DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
		state->status = RECLOCK_FAILED;
		return;
	}

	state->status = RECLOCK_OK;
	return;
}
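/* The protocol on the pipe is a single status byte: the child writes one of
 * the reclock_child_status values after attempting a pread() from the
 * recovery lock file, and then keeps re-writing it periodically until its
 * parent disappears.
 */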
static int check_recovery_lock(struct ctdb_context *ctdb)
{
	int ret;
	struct ctdb_check_reclock_state *state;
	pid_t parent = getpid();

	if (ctdb->recovery_lock_fd == -1) {
		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
		return -1;
	}

	state = talloc(ctdb, struct ctdb_check_reclock_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->ctdb = ctdb;
	state->start_time = timeval_current();
	state->status = RECLOCK_CHECKING;
	state->fd[0] = -1;
	state->fd[1] = -1;

	ret = pipe(state->fd);
	if (ret != 0) {
		talloc_free(state);
		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
		return -1;
	}

	state->child = ctdb_fork(ctdb);
	if (state->child == (pid_t)-1) {
		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
		close(state->fd[0]);
		state->fd[0] = -1;
		close(state->fd[1]);
		state->fd[1] = -1;
		talloc_free(state);
		return -1;
	}

	if (state->child == 0) {
		char cc = RECLOCK_OK;
		close(state->fd[0]);
		state->fd[0] = -1;

		debug_extra = talloc_asprintf(NULL, "recovery-lock:");
		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
			cc = RECLOCK_FAILED;
		}

		write(state->fd[1], &cc, 1);
		/* make sure we die when our parent dies */
		while (kill(parent, 0) == 0 || errno != ESRCH) {
			sleep(5);
			write(state->fd[1], &cc, 1);
		}
		_exit(0);
	}
	close(state->fd[1]);
	state->fd[1] = -1;
	set_close_on_exec(state->fd[0]);

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));

	talloc_set_destructor(state, check_reclock_destructor);

	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
				    ctdb_check_reclock_timeout, state);
	if (state->te == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
		talloc_free(state);
		return -1;
	}

	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
				  EVENT_FD_READ,
				  reclock_child_handler,
				  (void *)state);

	if (state->fde == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
		talloc_free(state);
		return -1;
	}
	tevent_fd_set_auto_close(state->fde);

	while (state->status == RECLOCK_CHECKING) {
		event_loop_once(ctdb->ev);
	}

	if (state->status == RECLOCK_FAILED) {
		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
		close(ctdb->recovery_lock_fd);
		ctdb->recovery_lock_fd = -1;
		talloc_free(state);
		return -1;
	}

	talloc_free(state);
	return 0;
}
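/* Note that check_recovery_lock() deliberately blocks the recovery daemon in
 * event_loop_once() until the child reports in or the 15 second timer fires.
 * On a failed read the lock fd is closed, so the lock has to be re-taken
 * from scratch by the next recovery.
 */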
static int update_recovery_lock_file(struct ctdb_context *ctdb)
{
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	const char *reclockfile;

	if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
		DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	if (reclockfile == NULL) {
		if (ctdb->recovery_lock_file != NULL) {
			DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
			talloc_free(ctdb->recovery_lock_file);
			ctdb->recovery_lock_file = NULL;
			if (ctdb->recovery_lock_fd != -1) {
				close(ctdb->recovery_lock_fd);
				ctdb->recovery_lock_fd = -1;
			}
		}
		ctdb->tunable.verify_recovery_lock = 0;
		talloc_free(tmp_ctx);
		return 0;
	}

	if (ctdb->recovery_lock_file == NULL) {
		ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
		if (ctdb->recovery_lock_fd != -1) {
			close(ctdb->recovery_lock_fd);
			ctdb->recovery_lock_fd = -1;
		}
		talloc_free(tmp_ctx);
		return 0;
	}

	if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
		/* the reclock file has not changed */
		talloc_free(tmp_ctx);
		return 0;
	}

	talloc_free(ctdb->recovery_lock_file);
	ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
	ctdb->tunable.verify_recovery_lock = 0;
	if (ctdb->recovery_lock_fd != -1) {
		close(ctdb->recovery_lock_fd);
		ctdb->recovery_lock_fd = -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
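/* update_recovery_lock_file() thus handles four cases: the reclock being
 * disabled by the daemon, the reclock being set for the first time, an
 * unchanged path (the common case), and a changed path. Whenever the path
 * changes the old fd is dropped, so the lock is re-acquired lazily.
 */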
static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
		      TALLOC_CTX *mem_ctx)
{
	uint32_t pnn;
	struct ctdb_node_map *nodemap=NULL;
	struct ctdb_node_map *recmaster_nodemap=NULL;
	struct ctdb_node_map **remote_nodemaps=NULL;
	struct ctdb_vnn_map *vnnmap=NULL;
	struct ctdb_vnn_map *remote_vnnmap=NULL;
	int32_t debug_level;
	int i, j, ret;

	/* verify that the main daemon is still running */
	if (kill(ctdb->ctdbd_pid, 0) != 0) {
		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
		exit(-1);
	}

	/* ping the local daemon to tell it we are alive */
	ctdb_ctrl_recd_ping(ctdb);

	if (rec->election_timeout) {
		/* an election is in progress */
		return;
	}

	/* read the debug level from the parent and update locally */
	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
		return;
	}
	LogLevel = debug_level;

	/* We must check if we need to ban a node here but we want to do this
	   as early as possible so we don't wait until we have pulled the node
	   map from the local node. That's why we use the hardcoded value 20
	   here.
	*/
	for (i=0; i<ctdb->num_nodes; i++) {
		struct ctdb_banning_state *ban_state;

		if (ctdb->nodes[i]->ban_state == NULL) {
			continue;
		}
		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		if (ban_state->count < 20) {
			continue;
		}
		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
			ctdb->nodes[i]->pnn, ban_state->count,
			ctdb->tunable.recovery_ban_period));
		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
		ban_state->count = 0;
	}
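	/* The threshold of 20 pairs with ctdb_set_culprit_count() above: every
	 * recovery attributed to a node bumps its ban_state count, and once a
	 * node has been blamed that many times it is banned for
	 * ctdb->tunable.recovery_ban_period seconds and its counter is reset.
	 */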
	/* get relevant tunables */
	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
		return;
	}

	/* get the current recovery lock file from the server */
	if (update_recovery_lock_file(ctdb) != 0) {
		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
		return;
	}

	/* Make sure that if recovery lock verification becomes disabled,
	   we close the file
	*/
	if (ctdb->tunable.verify_recovery_lock == 0) {
		if (ctdb->recovery_lock_fd != -1) {
			close(ctdb->recovery_lock_fd);
			ctdb->recovery_lock_fd = -1;
		}
	}
	pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
	if (pnn == (uint32_t)-1) {
		DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
		return;
	}

	/* get the vnnmap */
	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
		return;
	}

	/* get the nodemap, and with it the number of nodes */
	if (rec->nodemap) {
		talloc_free(rec->nodemap);
		rec->nodemap = NULL;
		nodemap = NULL;
	}
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
		return;
	}
	nodemap = rec->nodemap;

	/* update the capabilities for all nodes */
	ret = update_capabilities(ctdb, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
		return;
	}

	/* check which node is the recovery master */
	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
		return;
	}

	/* if we are not the recmaster we can safely ignore any ip reallocate requests */
	if (rec->recmaster != pnn) {
		if (rec->ip_reallocate_ctx != NULL) {
			talloc_free(rec->ip_reallocate_ctx);
			rec->ip_reallocate_ctx = NULL;
			rec->reallocate_callers = NULL;
		}
	}

	if (rec->recmaster == (uint32_t)-1) {
		DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
		force_election(rec, pnn, nodemap);
		return;
	}
	/* if the local daemon is STOPPED, we verify that the databases are
	   also frozen and that the recmode is set to active
	*/
	if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
		}
		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
			DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));

			ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
				return;
			}
			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
				return;
			}
		}
	}

	/* If the local node is stopped, verify we are not the recmaster
	   and yield this role if so
	*/
	if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
		DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
		force_election(rec, pnn, nodemap);
		return;
	}
	/*
	 * if the current recmaster does not have CTDB_CAP_RECMASTER,
	 * but we do, then force an election and try to become the new
	 * recmaster
	 */
	if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
	    (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
	    !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
		DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
				  " but we (node %u) have - force an election\n",
				  rec->recmaster, pnn));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* check that we (recovery daemon) and the local ctdb daemon
	   agree on whether we are banned or not
	*/

	/* remember our own node flags */
	rec->node_flags = nodemap->nodes[pnn].flags;
	/* count how many active and connected nodes there are */
	rec->num_active    = 0;
	rec->num_connected = 0;
	for (i=0; i<nodemap->num; i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			rec->num_active++;
		}
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
			rec->num_connected++;
		}
	}

	/* verify that the recmaster node is still active */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].pnn == rec->recmaster) {
			break;
		}
	}

	if (j == nodemap->num) {
		DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* if the recovery master is disconnected we must elect a new recmaster */
	if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
		DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* grab the nodemap from the recovery master to check if it is banned */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
				   mem_ctx, &recmaster_nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
			  nodemap->nodes[j].pnn));
		return;
	}

	if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
		DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
		force_election(rec, pnn, nodemap);
		return;
	}
	/* verify that we have all the ip addresses we should have and we
	 * don't have addresses we shouldn't have.
	 */
	if (ctdb->tunable.disable_ip_failover == 0) {
		if (rec->ip_check_disable_ctx == NULL) {
			if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
			}
		}
	}

	/* if we are not the recmaster then we do not need to check
	   if recovery is needed
	 */
	if (pnn != rec->recmaster) {
		return;
	}

	/* ensure our local copies of flags are right */
	ret = update_local_flags(rec, nodemap);
	if (ret == MONITOR_ELECTION_NEEDED) {
		DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
		force_election(rec, pnn, nodemap);
		return;
	}
	if (ret != MONITOR_OK) {
		DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
		return;
	}
	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
		reload_nodes_file(ctdb);
		return;
	}

	/* verify that all active nodes agree that we are the recmaster */
	switch (verify_recmaster(rec, nodemap, pnn)) {
	case MONITOR_RECOVERY_NEEDED:
		/* can not happen */
		return;
	case MONITOR_ELECTION_NEEDED:
		force_election(rec, pnn, nodemap);
		return;
	case MONITOR_OK:
		break;
	case MONITOR_FAILED:
		return;
	}

	if (rec->need_recovery) {
		/* a previous recovery didn't finish */
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes are in normal mode
	   and not in recovery mode
	*/
	switch (verify_recmode(ctdb, nodemap)) {
	case MONITOR_RECOVERY_NEEDED:
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	case MONITOR_FAILED:
		return;
	case MONITOR_ELECTION_NEEDED:
		/* can not happen */
	case MONITOR_OK:
		break;
	}

	if (ctdb->tunable.verify_recovery_lock != 0) {
		/* we should have the reclock - check it is not stale */
		ret = check_recovery_lock(ctdb);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}
	/* if there are takeovers requested, perform them and notify the waiters */
	if (rec->reallocate_callers) {
		process_ipreallocate_requests(ctdb, rec);
	}

	/* get the nodemap for all active remote nodes
	 */
	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
	if (remote_nodemaps == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
		return;
	}
	for (i=0; i<nodemap->num; i++) {
		remote_nodemaps[i] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
		return;
	}
	/* verify that all other nodes have the same nodemap as we have
	*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (remote_nodemaps[j] == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
			ctdb_set_culprit(rec, j);
			return;
		}

		/* if the nodes disagree on how many nodes there are
		   then this is a good reason to try recovery
		 */
		if (remote_nodemaps[j]->num != nodemap->num) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* if the nodes disagree on which nodes exist and are
		   active, then that is also a good reason to do recovery
		 */
		for (i=0;i<nodemap->num;i++) {
			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
					  nodemap->nodes[j].pnn, i,
					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}

		/* verify the flags are consistent
		*/
		for (i=0; i<nodemap->num; i++) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}

			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
				  nodemap->nodes[j].pnn,
				  nodemap->nodes[i].pnn,
				  remote_nodemaps[j]->nodes[i].flags,
				  nodemap->nodes[i].flags));
				if (i == j) {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				} else {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				}
			}
		}
	}
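	/* Conflict resolution rule used above: a node is authoritative about
	 * its own flags (the i == j case), so its view is pushed to everyone;
	 * for any other disagreement the recmaster's local copy wins. Either
	 * way a recovery is triggered to bring the cluster back in sync.
	 */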
	/* there had better be the same number of lmasters in the vnn map
	   as there are active nodes, or we will have to do a recovery
	 */
	if (vnnmap->size != rec->num_active) {
		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
			  vnnmap->size, rec->num_active));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		for (i=0; i<vnnmap->size; i++) {
			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
				break;
			}
		}
		if (i == vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}
	/* verify that all other nodes have the same vnnmap
	   and are from the same generation
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					  mem_ctx, &remote_vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			return;
		}

		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0;i<vnnmap->size;i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
					  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}
	/* we might need to change who has what IP assigned */
	if (rec->need_takeover_run) {
		uint32_t culprit = (uint32_t)-1;

		rec->need_takeover_run = false;

		/* update the list of public ips that a node can handle for
		   all connected nodes
		*/
		ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
					 culprit));
			rec->need_takeover_run = true;
			return;
		}

		/* execute the "startrecovery" event script on all nodes */
		ret = run_startrecovery_eventscript(rec, nodemap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		ret = ctdb_takeover_run(ctdb, nodemap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
			return;
		}

		/* execute the "recovered" event script on all nodes */
		ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
#if 0
		/* We can't check whether the event completed successfully,
		   since this script WILL fail if the node is in recovery mode
		   and, if that race happens, the code here would just cause a
		   second cascading recovery.
		*/
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		}
#endif
	}
}
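/* main_loop() returns on the first anomaly it finds (or after triggering a
 * recovery); monitor_cluster() below simply calls it again on the next
 * iteration, so every check is retried from the top rather than in place.
 */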
/*
  the main monitoring loop
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;

	rec->priority_time = timeval_current();

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for updating the recovery daemon's node assignment for an ip */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);

	for (;;) {
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (!mem_ctx) {
			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every second */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
					  - elapsed);
		}
	}
}
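/* The loop above is self-pacing: if an iteration of main_loop() took less
 * than ctdb->tunable.recover_interval seconds, the remainder is slept off in
 * ctdb_wait_timeout(), so the recovery checks run roughly once per interval.
 */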
/*
  event handler for when the main ctdbd dies
 */
static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
	_exit(1);
}

/*
  called regularly to verify that the recovery daemon is still running
 */
static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
			    struct timeval yt, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);

	if (kill(ctdb->recoverd_pid, 0) != 0) {
		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));

		event_add_timed(ctdb->ev, ctdb, timeval_zero(),
				ctdb_restart_recd, ctdb);
	}

	event_add_timed(ctdb->ev, ctdb,
			timeval_current_ofs(30, 0),
			ctdb_check_recd, ctdb);
}
static void recd_sig_child_handler(struct event_context *ev,
				   struct signal_event *se, int signum, int count,
				   void *dont_care,
				   void *private_data)
{
//	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	int status;
	pid_t pid = -1;

	while (pid != 0) {
		pid = waitpid(-1, &status, WNOHANG);
		if (pid == -1) {
			if (errno != ECHILD) {
				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno), errno));
			}
			return;
		}
		if (pid > 0) {
			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
		}
	}
}
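/* The recovery daemon forks helper processes (such as the reclock checker
 * above), so this handler reaps everything that has exited, using WNOHANG to
 * avoid blocking; the exit status itself is only logged at debug level.
 */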
/*
  startup the recovery daemon as a child of the main ctdb daemon
 */
int ctdb_start_recoverd(struct ctdb_context *ctdb)
{
	int fd[2];
	struct signal_event *se;
	struct tevent_fd *fde;

	if (pipe(fd) != 0) {
		return -1;
	}

	ctdb->ctdbd_pid = getpid();

	ctdb->recoverd_pid = fork();
	if (ctdb->recoverd_pid == -1) {
		return -1;
	}

	if (ctdb->recoverd_pid != 0) {
		/* parent: keep the write end open so the child can detect
		   our death via EOF on the read end */
		close(fd[0]);
		event_add_timed(ctdb->ev, ctdb,
				timeval_current_ofs(30, 0),
				ctdb_check_recd, ctdb);
		return 0;
	}

	close(fd[1]);

	srandom(getpid() ^ time(NULL));

	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
		DEBUG(DEBUG_CRIT, (__location__ " ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
		exit(1);
	}

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));

	fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
			   ctdb_recoverd_parent, &fd[0]);
	tevent_fd_set_auto_close(fde);

	/* set up a handler to pick up sigchld */
	se = event_add_signal(ctdb->ev, ctdb,
			      SIGCHLD, 0,
			      recd_sig_child_handler,
			      ctdb);
	if (se == NULL) {
		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
		exit(1);
	}

	monitor_cluster(ctdb);

	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
	return -1;
}
/*
  shutdown the recovery daemon
 */
void ctdb_stop_recoverd(struct ctdb_context *ctdb)
{
	if (ctdb->recoverd_pid == 0) {
		return;
	}

	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
	kill(ctdb->recoverd_pid, SIGTERM);
}

static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
			      struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);
}