4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/tevent/tevent.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb_client.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
39 struct rd_memdump_reply *rd;
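/*
  per-node banning state: how many ban credits a node has accumulated
  and when it last misbehaved
 */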
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
67 struct ctdb_control_get_ifaces *ifaces;
68 TALLOC_CTX *deferred_rebalance_ctx;
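/*
  timeouts for controls sent from the recovery daemon, derived from the
  recover_timeout and recover_interval tunables
 */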
71 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
72 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
74 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
77 ban a node for a period of time
79 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
82 struct ctdb_context *ctdb = rec->ctdb;
83 struct ctdb_ban_time bantime;
85 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
87 if (!ctdb_validate_pnn(ctdb, pnn)) {
88 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
93 bantime.time = ban_time;
95 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
97 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
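/*
  result codes returned by the cluster monitoring checks
 */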
103 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
107 run the "recovered" eventscript on all nodes
109 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
114 tmp_ctx = talloc_new(ctdb);
115 CTDB_NO_MEMORY(ctdb, tmp_ctx);
117 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
118 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
120 CONTROL_TIMEOUT(), false, tdb_null,
123 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
125 talloc_free(tmp_ctx);
129 talloc_free(tmp_ctx);
134 remember the trouble maker
136 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
138 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
139 struct ctdb_banning_state *ban_state;
141 if (culprit > ctdb->num_nodes) {
142 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
146 if (ctdb->nodes[culprit]->ban_state == NULL) {
147 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
148 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
152 ban_state = ctdb->nodes[culprit]->ban_state;
153 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
154 /* this was the first time in a long while this node
155 misbehaved so we will forgive any old transgressions.
157 ban_state->count = 0;
160 ban_state->count += count;
161 ban_state->last_reported_time = timeval_current();
162 rec->last_culprit_node = culprit;
166 remember the trouble maker
168 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
170 ctdb_set_culprit_count(rec, culprit, 1);
174 /* this callback is called for every node that failed to execute the startrecovery event */
177 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
179 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
181 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
183 ctdb_set_culprit(rec, node_pnn);
187 run the "startrecovery" eventscript on all nodes
189 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
193 struct ctdb_context *ctdb = rec->ctdb;
195 tmp_ctx = talloc_new(ctdb);
196 CTDB_NO_MEMORY(ctdb, tmp_ctx);
198 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
199 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
201 CONTROL_TIMEOUT(), false, tdb_null,
203 startrecovery_fail_callback,
205 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
206 talloc_free(tmp_ctx);
210 talloc_free(tmp_ctx);
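/*
  callback for async GET_CAPABILITIES controls: validate the reply size
  and store the capability bits for the responding node
 */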
214 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
216 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
217 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
220 if (node_pnn < ctdb->num_nodes) {
221 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
226 update the node capabilities for all connected nodes
228 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
233 tmp_ctx = talloc_new(ctdb);
234 CTDB_NO_MEMORY(ctdb, tmp_ctx);
236 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
237 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
241 async_getcap_callback, NULL,
243 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
244 talloc_free(tmp_ctx);
248 talloc_free(tmp_ctx);
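/*
  called for each node that failed to freeze during recovery - charge it
  with nodemap->num ban credits
 */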
252 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
254 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
256 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
257 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
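/*
  called for each node that failed to start the recovery transaction -
  charge it with nodemap->num ban credits
 */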
260 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
262 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
264 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
265 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
269 change recovery mode on all nodes
271 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
277 tmp_ctx = talloc_new(ctdb);
278 CTDB_NO_MEMORY(ctdb, tmp_ctx);
280 /* freeze all nodes */
281 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
282 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
285 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
286 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
291 set_recmode_fail_callback,
293 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
294 talloc_free(tmp_ctx);
301 data.dsize = sizeof(uint32_t);
302 data.dptr = (unsigned char *)&rec_mode;
304 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
310 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
311 talloc_free(tmp_ctx);
315 talloc_free(tmp_ctx);
320 change recovery master on all nodes
322 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
328 tmp_ctx = talloc_new(ctdb);
329 CTDB_NO_MEMORY(ctdb, tmp_ctx);
331 data.dsize = sizeof(uint32_t);
332 data.dptr = (unsigned char *)&pnn;
334 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
335 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
337 CONTROL_TIMEOUT(), false, data,
340 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
341 talloc_free(tmp_ctx);
345 talloc_free(tmp_ctx);
349 /* update all remote nodes to use the same db priority that we have.
350 This can fail if the remote node has not yet been upgraded to
351 support this function, so we always return success and never fail
352 a recovery if this call fails.
354 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
355 struct ctdb_node_map *nodemap,
356 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
361 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
363 /* step through all local databases */
364 for (db=0; db<dbmap->num;db++) {
366 struct ctdb_db_priority db_prio;
369 db_prio.db_id = dbmap->dbs[db].dbid;
370 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
372 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
376 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
378 data.dptr = (uint8_t *)&db_prio;
379 data.dsize = sizeof(db_prio);
381 if (ctdb_client_async_control(ctdb,
382 CTDB_CONTROL_SET_DB_PRIORITY,
384 CONTROL_TIMEOUT(), false, data,
387 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
395 ensure all other nodes have attached to any databases that we have
397 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
398 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
401 struct ctdb_dbid_map *remote_dbmap;
403 /* verify that all other nodes have all our databases */
404 for (j=0; j<nodemap->num; j++) {
405 /* we don't need to check ourselves */
406 if (nodemap->nodes[j].pnn == pnn) {
409 /* dont check nodes that are unavailable */
410 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
414 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
415 mem_ctx, &remote_dbmap);
417 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
421 /* step through all local databases */
422 for (db=0; db<dbmap->num;db++) {
426 for (i=0;i<remote_dbmap->num;i++) {
427 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
431 /* the remote node already has this database */
432 if (i!=remote_dbmap->num) {
435 /* ok so we need to create this database */
436 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
439 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
442 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
443 mem_ctx, name, dbmap->dbs[db].persistent);
445 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
456 ensure we are attached to any databases that anyone else is attached to
458 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
459 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
462 struct ctdb_dbid_map *remote_dbmap;
464 /* verify that we have all databases that any other node has */
465 for (j=0; j<nodemap->num; j++) {
466 /* we don't need to check ourselves */
467 if (nodemap->nodes[j].pnn == pnn) {
470 /* dont check nodes that are unavailable */
471 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
475 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
476 mem_ctx, &remote_dbmap);
478 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
482 /* step through all databases on the remote node */
483 for (db=0; db<remote_dbmap->num;db++) {
486 for (i=0;i<(*dbmap)->num;i++) {
487 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
491 /* we already have this db locally */
492 if (i!=(*dbmap)->num) {
495 /* ok so we need to create this database and then reread our dbmap */
498 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
499 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
501 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
502 nodemap->nodes[j].pnn));
505 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
506 remote_dbmap->dbs[db].persistent);
508 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
511 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
513 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
524 pull the remote database contents from one node into the recdb
526 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
527 struct tdb_wrap *recdb, uint32_t dbid,
532 struct ctdb_marshall_buffer *reply;
533 struct ctdb_rec_data *rec;
535 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
537 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
538 CONTROL_TIMEOUT(), &outdata);
540 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
541 talloc_free(tmp_ctx);
545 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
547 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
548 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
549 talloc_free(tmp_ctx);
553 rec = (struct ctdb_rec_data *)&reply->data[0];
557 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
559 struct ctdb_ltdb_header *hdr;
562 key.dptr = &rec->data[0];
563 key.dsize = rec->keylen;
564 data.dptr = &rec->data[key.dsize];
565 data.dsize = rec->datalen;
567 hdr = (struct ctdb_ltdb_header *)data.dptr;
569 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
570 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
571 talloc_free(tmp_ctx);
575 /* fetch the existing record, if any */
576 existing = tdb_fetch(recdb->tdb, key);
578 if (existing.dptr != NULL) {
579 struct ctdb_ltdb_header header;
580 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
581 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
582 (unsigned)existing.dsize, srcnode));
584 talloc_free(tmp_ctx);
587 header = *(struct ctdb_ltdb_header *)existing.dptr;
589 if (!(header.rsn < hdr->rsn ||
590 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
595 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
596 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
597 talloc_free(tmp_ctx);
602 talloc_free(tmp_ctx);
608 pull all the remote database contents into the recdb
610 static int pull_remote_database(struct ctdb_context *ctdb,
611 struct ctdb_recoverd *rec,
612 struct ctdb_node_map *nodemap,
613 struct tdb_wrap *recdb, uint32_t dbid,
618 /* pull all records from all other nodes across onto this node
619 (this merges based on rsn)
621 for (j=0; j<nodemap->num; j++) {
622 /* dont merge from nodes that are unavailable */
623 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
626 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
627 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
628 nodemap->nodes[j].pnn));
629 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
639 update flags on all active nodes
641 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
645 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
647 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
655 ensure all nodes have the same vnnmap we do
657 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
658 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
662 /* push the new vnn map out to all the nodes */
663 for (j=0; j<nodemap->num; j++) {
664 /* dont push to nodes that are unavailable */
665 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
669 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
671 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
681 struct vacuum_info *next, *prev;
682 struct ctdb_recoverd *rec;
684 struct ctdb_db_context *ctdb_db;
685 struct ctdb_marshall_buffer *recs;
686 struct ctdb_rec_data *r;
689 static void vacuum_fetch_next(struct vacuum_info *v);
692 called when a vacuum fetch has completed - just free it and do the next one
694 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
696 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
698 vacuum_fetch_next(v);
703 process the next element from the vacuum list
705 static void vacuum_fetch_next(struct vacuum_info *v)
707 struct ctdb_call call;
708 struct ctdb_rec_data *r;
710 while (v->recs->count) {
711 struct ctdb_client_call_state *state;
713 struct ctdb_ltdb_header *hdr;
716 call.call_id = CTDB_NULL_FUNC;
717 call.flags = CTDB_IMMEDIATE_MIGRATION;
718 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
721 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
724 call.key.dptr = &r->data[0];
725 call.key.dsize = r->keylen;
727 /* ensure we don't block this daemon - just skip a record if we can't get the chainlock */
729 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
733 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
734 if (data.dptr == NULL) {
735 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
739 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
741 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
745 hdr = (struct ctdb_ltdb_header *)data.dptr;
746 if (hdr->dmaster == v->rec->ctdb->pnn) {
747 /* its already local */
749 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
755 state = ctdb_call_send(v->ctdb_db, &call);
756 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
758 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
762 state->async.fn = vacuum_fetch_callback;
763 state->async.private_data = v;
772 destroy a vacuum info structure
774 static int vacuum_info_destructor(struct vacuum_info *v)
776 DLIST_REMOVE(v->rec->vacuum_info, v);
782 handler for vacuum fetch
784 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
785 TDB_DATA data, void *private_data)
787 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
788 struct ctdb_marshall_buffer *recs;
790 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
792 struct ctdb_dbid_map *dbmap=NULL;
793 bool persistent = false;
794 struct ctdb_db_context *ctdb_db;
795 struct ctdb_rec_data *r;
797 struct vacuum_info *v;
799 recs = (struct ctdb_marshall_buffer *)data.dptr;
800 r = (struct ctdb_rec_data *)&recs->data[0];
802 if (recs->count == 0) {
803 talloc_free(tmp_ctx);
809 for (v=rec->vacuum_info;v;v=v->next) {
810 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
811 /* we're already working on records from this node */
812 talloc_free(tmp_ctx);
817 /* work out if the database is persistent */
818 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
820 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
821 talloc_free(tmp_ctx);
825 for (i=0;i<dbmap->num;i++) {
826 if (dbmap->dbs[i].dbid == recs->db_id) {
827 persistent = dbmap->dbs[i].persistent;
831 if (i == dbmap->num) {
832 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
833 talloc_free(tmp_ctx);
837 /* find the name of this database */
838 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
839 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
840 talloc_free(tmp_ctx);
845 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
846 if (ctdb_db == NULL) {
847 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
848 talloc_free(tmp_ctx);
852 v = talloc_zero(rec, struct vacuum_info);
854 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
855 talloc_free(tmp_ctx);
860 v->srcnode = srcnode;
861 v->ctdb_db = ctdb_db;
862 v->recs = talloc_memdup(v, recs, data.dsize);
863 if (v->recs == NULL) {
864 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
866 talloc_free(tmp_ctx);
869 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
871 DLIST_ADD(rec->vacuum_info, v);
873 talloc_set_destructor(v, vacuum_info_destructor);
875 vacuum_fetch_next(v);
876 talloc_free(tmp_ctx);
881 called when ctdb_wait_timeout should finish
883 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
884 struct timeval yt, void *p)
886 uint32_t *timed_out = (uint32_t *)p;
891 wait for a given number of seconds
893 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
895 uint32_t timed_out = 0;
896 time_t usecs = (secs - (time_t)secs) * 1000000;
897 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
899 event_loop_once(ctdb->ev);
904 called when an election times out (ends)
906 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
907 struct timeval t, void *p)
909 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
910 rec->election_timeout = NULL;
913 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
918 wait for an election to finish. It finishes election_timeout seconds after
919 the last election packet is received
921 static void ctdb_wait_election(struct ctdb_recoverd *rec)
923 struct ctdb_context *ctdb = rec->ctdb;
924 while (rec->election_timeout) {
925 event_loop_once(ctdb->ev);
930 Update our local flags from all remote connected nodes.
931 This is only run when we are, or believe we are, the recovery master
933 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
936 struct ctdb_context *ctdb = rec->ctdb;
937 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
939 /* get the nodemap for all active remote nodes and verify
940 they are the same as for this node
942 for (j=0; j<nodemap->num; j++) {
943 struct ctdb_node_map *remote_nodemap=NULL;
946 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
949 if (nodemap->nodes[j].pnn == ctdb->pnn) {
953 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
954 mem_ctx, &remote_nodemap);
956 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
957 nodemap->nodes[j].pnn));
958 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
959 talloc_free(mem_ctx);
960 return MONITOR_FAILED;
962 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
963 /* We should tell our daemon about this so it
964 updates its flags or else we will log the same
965 message again in the next iteration of recovery.
966 Since we are the recovery master we can just as
967 well update the flags on all nodes.
969 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
971 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
975 /* Update our local copy of the flags in the recovery daemon */
978 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
979 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
980 nodemap->nodes[j].flags));
981 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
983 talloc_free(remote_nodemap);
985 talloc_free(mem_ctx);
990 /* Create a new random generation id.
991 The generation id can not be the INVALID_GENERATION id
993 static uint32_t new_generation(void)
998 generation = random();
1000 if (generation != INVALID_GENERATION) {
1010 create a temporary working database
1012 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1015 struct tdb_wrap *recdb;
1018 /* open up the temporary recovery database */
1019 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1020 ctdb->db_directory_state,
1027 tdb_flags = TDB_NOLOCK;
1028 if (ctdb->valgrinding) {
1029 tdb_flags |= TDB_NOMMAP;
1031 tdb_flags |= TDB_DISALLOW_NESTING;
1033 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1034 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1035 if (recdb == NULL) {
1036 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1046 a traverse function for pulling all relevant records from recdb
1049 struct ctdb_context *ctdb;
1050 struct ctdb_marshall_buffer *recdata;
1056 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1058 struct recdb_data *params = (struct recdb_data *)p;
1059 struct ctdb_rec_data *rec;
1060 struct ctdb_ltdb_header *hdr;
1062 /* skip empty records */
1063 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1067 /* update the dmaster field to point to us */
1068 hdr = (struct ctdb_ltdb_header *)data.dptr;
1069 if (!params->persistent) {
1070 hdr->dmaster = params->ctdb->pnn;
1071 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1074 /* add the record to the blob ready to send to the nodes */
1075 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1077 params->failed = true;
1080 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1081 if (params->recdata == NULL) {
1082 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1083 rec->length + params->len, params->recdata->count));
1084 params->failed = true;
1087 params->recdata->count++;
1088 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1089 params->len += rec->length;
1096 push the recdb database out to all nodes
1098 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1100 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1102 struct recdb_data params;
1103 struct ctdb_marshall_buffer *recdata;
1105 TALLOC_CTX *tmp_ctx;
1108 tmp_ctx = talloc_new(ctdb);
1109 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1111 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1112 CTDB_NO_MEMORY(ctdb, recdata);
1114 recdata->db_id = dbid;
1117 params.recdata = recdata;
1118 params.len = offsetof(struct ctdb_marshall_buffer, data);
1119 params.failed = false;
1120 params.persistent = persistent;
1122 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1123 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1124 talloc_free(params.recdata);
1125 talloc_free(tmp_ctx);
1129 if (params.failed) {
1130 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1131 talloc_free(params.recdata);
1132 talloc_free(tmp_ctx);
1136 recdata = params.recdata;
1138 outdata.dptr = (void *)recdata;
1139 outdata.dsize = params.len;
1141 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1142 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1144 CONTROL_TIMEOUT(), false, outdata,
1147 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1148 talloc_free(recdata);
1149 talloc_free(tmp_ctx);
1153 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1154 dbid, recdata->count));
1156 talloc_free(recdata);
1157 talloc_free(tmp_ctx);
1164 go through a full recovery on one database
1166 static int recover_database(struct ctdb_recoverd *rec,
1167 TALLOC_CTX *mem_ctx,
1171 struct ctdb_node_map *nodemap,
1172 uint32_t transaction_id)
1174 struct tdb_wrap *recdb;
1176 struct ctdb_context *ctdb = rec->ctdb;
1178 struct ctdb_control_wipe_database w;
1181 recdb = create_recdb(ctdb, mem_ctx);
1182 if (recdb == NULL) {
1186 /* pull all remote databases onto the recdb */
1187 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1189 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1193 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1195 /* wipe all the remote databases. This is safe as we are in a transaction */
1197 w.transaction_id = transaction_id;
1199 data.dptr = (void *)&w;
1200 data.dsize = sizeof(w);
1202 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1203 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1205 CONTROL_TIMEOUT(), false, data,
1208 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1213 /* push out the correct database. This sets the dmaster and skips
1214 the empty records */
1215 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1221 /* all done with this database */
1228 reload the nodes file
1230 static void reload_nodes_file(struct ctdb_context *ctdb)
1233 ctdb_load_nodes_file(ctdb);
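/*
  refresh the known and available public ip lists for every active node
  so that the next takeover run works with current information
 */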
1236 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1237 struct ctdb_recoverd *rec,
1238 struct ctdb_node_map *nodemap,
1244 if (ctdb->num_nodes != nodemap->num) {
1245 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1246 ctdb->num_nodes, nodemap->num));
1248 *culprit = ctdb->pnn;
1253 for (j=0; j<nodemap->num; j++) {
1254 /* release any existing data */
1255 if (ctdb->nodes[j]->known_public_ips) {
1256 talloc_free(ctdb->nodes[j]->known_public_ips);
1257 ctdb->nodes[j]->known_public_ips = NULL;
1259 if (ctdb->nodes[j]->available_public_ips) {
1260 talloc_free(ctdb->nodes[j]->available_public_ips);
1261 ctdb->nodes[j]->available_public_ips = NULL;
1264 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1268 /* grab a new shiny list of public ips from the node */
1269 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1271 ctdb->nodes[j]->pnn,
1274 &ctdb->nodes[j]->known_public_ips);
1276 DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1277 ctdb->nodes[j]->pnn));
1279 *culprit = ctdb->nodes[j]->pnn;
1284 if (ctdb->tunable.disable_ip_failover == 0) {
1285 if (rec->ip_check_disable_ctx == NULL) {
1286 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1287 DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1288 rec->need_takeover_run = true;
1293 /* grab a new shiny list of public ips from the node */
1294 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1296 ctdb->nodes[j]->pnn,
1298 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1299 &ctdb->nodes[j]->available_public_ips);
1301 DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1302 ctdb->nodes[j]->pnn));
1304 *culprit = ctdb->nodes[j]->pnn;
1313 /* when we start a recovery, make sure all nodes use the same reclock file
1316 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1318 struct ctdb_context *ctdb = rec->ctdb;
1319 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1323 if (ctdb->recovery_lock_file == NULL) {
1327 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1328 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1331 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1338 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1339 talloc_free(tmp_ctx);
1343 talloc_free(tmp_ctx);
1349 we are the recmaster, and recovery is needed - start a recovery run
1351 static int do_recovery(struct ctdb_recoverd *rec,
1352 TALLOC_CTX *mem_ctx, uint32_t pnn,
1353 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1355 struct ctdb_context *ctdb = rec->ctdb;
1357 uint32_t generation;
1358 struct ctdb_dbid_map *dbmap;
1361 struct timeval start_time;
1362 uint32_t culprit = (uint32_t)-1;
1364 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1366 /* if recovery fails, force it again */
1367 rec->need_recovery = true;
1369 for (i=0; i<ctdb->num_nodes; i++) {
1370 struct ctdb_banning_state *ban_state;
1372 if (ctdb->nodes[i]->ban_state == NULL) {
1375 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1376 if (ban_state->count < 2*ctdb->num_nodes) {
1379 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1380 ctdb->nodes[i]->pnn, ban_state->count,
1381 ctdb->tunable.recovery_ban_period));
1382 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1383 ban_state->count = 0;
1387 if (ctdb->tunable.verify_recovery_lock != 0) {
1388 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1389 start_time = timeval_current();
1390 if (!ctdb_recovery_lock(ctdb, true)) {
1391 ctdb_set_culprit(rec, pnn);
1392 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1395 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1396 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1399 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1401 /* get a list of all databases */
1402 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1404 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1408 /* we do the db creation before we set the recovery mode, so the freeze happens
1409 on all databases we will be dealing with. */
1411 /* verify that we have all the databases any other node has */
1412 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1414 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1418 /* verify that all other nodes have all our databases */
1419 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1421 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1424 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1426 /* update the database priority for all remote databases */
1427 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1429 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1431 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1434 /* update all other nodes to use the same setting for reclock files
1435 as the local recovery master.
1437 sync_recovery_lock_file_across_cluster(rec);
1439 /* set recovery mode to active on all nodes */
1440 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1442 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1446 /* execute the "startrecovery" event script on all nodes */
1447 ret = run_startrecovery_eventscript(rec, nodemap);
1449 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1454 update all nodes to have the same flags that we have
1456 for (i=0;i<nodemap->num;i++) {
1457 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1461 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1463 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1468 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1470 /* pick a new generation number */
1471 generation = new_generation();
1473 /* change the vnnmap on this node to use the new generation
1474 number but not on any other nodes.
1475 this guarantees that if we abort the recovery prematurely
1476 for some reason (a node stops responding?)
1477 that we can just return immediately and we will reenter
1478 recovery shortly again.
1479 I.e. we deliberately leave the cluster with an inconsistent
1480 generation id to allow us to abort recovery at any stage and
1481 just restart it from scratch.
1483 vnnmap->generation = generation;
1484 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1486 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1490 data.dptr = (void *)&generation;
1491 data.dsize = sizeof(uint32_t);
1493 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1494 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1496 CONTROL_TIMEOUT(), false, data,
1498 transaction_start_fail_callback,
1500 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1501 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1503 CONTROL_TIMEOUT(), false, tdb_null,
1507 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1512 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1514 for (i=0;i<dbmap->num;i++) {
1515 ret = recover_database(rec, mem_ctx,
1517 dbmap->dbs[i].persistent,
1518 pnn, nodemap, generation);
1520 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1525 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1527 /* commit all the changes */
1528 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1530 CONTROL_TIMEOUT(), false, data,
1533 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1537 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1540 /* update the capabilities for all nodes */
1541 ret = update_capabilities(ctdb, nodemap);
1543 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1547 /* build a new vnn map with all the currently active and unbanned nodes */
1549 generation = new_generation();
1550 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1551 CTDB_NO_MEMORY(ctdb, vnnmap);
1552 vnnmap->generation = generation;
1554 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1555 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1556 for (i=j=0;i<nodemap->num;i++) {
1557 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1560 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1561 /* this node can not be an lmaster */
1562 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1567 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1568 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1569 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1572 if (vnnmap->size == 0) {
1573 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1575 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1576 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1577 vnnmap->map[0] = pnn;
1580 /* update to the new vnnmap on all nodes */
1581 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1583 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1587 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1589 /* update recmaster to point to us for all nodes */
1590 ret = set_recovery_master(ctdb, nodemap, pnn);
1592 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1596 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1599 update all nodes to have the same flags that we have
1601 for (i=0;i<nodemap->num;i++) {
1602 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1606 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1608 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1613 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1615 /* disable recovery mode */
1616 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1618 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1622 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1625 tell nodes to takeover their public IPs
1627 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1629 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1631 rec->need_takeover_run = true;
1634 rec->need_takeover_run = false;
1635 ret = ctdb_takeover_run(ctdb, nodemap);
1637 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1638 rec->need_takeover_run = true;
1641 /* execute the "recovered" event script on all nodes */
1642 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1644 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1648 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1650 /* send a message to all clients telling them that the cluster
1651 has been reconfigured */
1652 ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1654 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1656 rec->need_recovery = false;
1658 /* we managed to complete a full recovery, make sure to forgive
1659 any past sins by the nodes that could now participate in the recovery.
1662 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1663 for (i=0;i<nodemap->num;i++) {
1664 struct ctdb_banning_state *ban_state;
1666 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1670 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1671 if (ban_state == NULL) {
1675 ban_state->count = 0;
1679 /* We just finished a recovery successfully.
1680 We now wait for rerecovery_timeout before we allow
1681 another recovery to take place.
1683 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1684 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1685 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
1692 elections are won by first checking the number of connected nodes, then
1693 the priority time, then the pnn
1695 struct election_message {
1696 uint32_t num_connected;
1697 struct timeval priority_time;
1699 uint32_t node_flags;
1703 form this node's election data
1705 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1708 struct ctdb_node_map *nodemap;
1709 struct ctdb_context *ctdb = rec->ctdb;
1713 em->pnn = rec->ctdb->pnn;
1714 em->priority_time = rec->priority_time;
1716 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1718 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1722 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1723 em->node_flags = rec->node_flags;
1725 for (i=0;i<nodemap->num;i++) {
1726 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1727 em->num_connected++;
1731 /* we shouldnt try to win this election if we cant be a recmaster */
1732 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1733 em->num_connected = 0;
1734 em->priority_time = timeval_current();
1737 talloc_free(nodemap);
1741 see if the given election data wins
1743 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1745 struct election_message myem;
1748 ctdb_election_data(rec, &myem);
1750 /* we cant win if we dont have the recmaster capability */
1751 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1755 /* we cant win if we are banned */
1756 if (rec->node_flags & NODE_FLAGS_BANNED) {
1760 /* we cant win if we are stopped */
1761 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1765 /* we will automatically win if the other node is banned */
1766 if (em->node_flags & NODE_FLAGS_BANNED) {
1770 /* we will automatically win if the other node is stopped */
1771 if (em->node_flags & NODE_FLAGS_STOPPED) {
1775 /* try to use the most connected node */
1777 cmp = (int)myem.num_connected - (int)em->num_connected;
1780 /* then the longest running node */
1782 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1786 cmp = (int)myem.pnn - (int)em->pnn;
1793 send out an election request
1795 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1798 TDB_DATA election_data;
1799 struct election_message emsg;
1801 struct ctdb_context *ctdb = rec->ctdb;
1803 srvid = CTDB_SRVID_RECOVERY;
1805 ctdb_election_data(rec, &emsg);
1807 election_data.dsize = sizeof(struct election_message);
1808 election_data.dptr = (unsigned char *)&emsg;
1811 /* send an election message to all active nodes */
1812 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1813 ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1816 /* A new node that is already frozen has entered the cluster.
1817 The existing nodes are not frozen and dont need to be frozen
1818 until the election has ended and we start the actual recovery
1820 if (update_recmaster == true) {
1821 /* first we assume we will win the election and set
1822 recoverymaster to be ourself on the current node
1824 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1826 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1836 this function will unban all nodes in the cluster
1838 static void unban_all_nodes(struct ctdb_context *ctdb)
1841 struct ctdb_node_map *nodemap;
1842 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1844 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1846 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1850 for (i=0;i<nodemap->num;i++) {
1851 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1852 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1853 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1857 talloc_free(tmp_ctx);
1862 we think we are winning the election - send a broadcast election request
1864 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1866 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1869 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1871 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1874 talloc_free(rec->send_election_te);
1875 rec->send_election_te = NULL;
1879 handler for memory dumps
1881 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1882 TDB_DATA data, void *private_data)
1884 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1887 struct rd_memdump_reply *rd;
1889 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1890 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1891 talloc_free(tmp_ctx);
1894 rd = (struct rd_memdump_reply *)data.dptr;
1896 dump = talloc_zero(tmp_ctx, TDB_DATA);
1898 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1899 talloc_free(tmp_ctx);
1902 ret = ctdb_dump_memory(ctdb, dump);
1904 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1905 talloc_free(tmp_ctx);
1909 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1911 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1913 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1914 talloc_free(tmp_ctx);
1918 talloc_free(tmp_ctx);
1922 handler for reload_nodes
1924 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1925 TDB_DATA data, void *private_data)
1927 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1929 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1931 reload_nodes_file(rec->ctdb);
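/*
  timed event: the "disable ip check" period has expired, so re-enable
  the ip allocation verification
 */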
1935 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1936 struct timeval yt, void *p)
1938 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1940 talloc_free(rec->ip_check_disable_ctx);
1941 rec->ip_check_disable_ctx = NULL;
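/*
  timed event: a deferred rebalance has expired - run a takeover run now
 */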
1945 static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event *te,
1946 struct timeval t, void *p)
1948 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1949 struct ctdb_context *ctdb = rec->ctdb;
1952 DEBUG(DEBUG_NOTICE,("Rebalance all nodes that have had ip assignment changes.\n"));
1954 ret = ctdb_takeover_run(ctdb, rec->nodemap);
1956 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1957 rec->need_takeover_run = true;
1960 talloc_free(rec->deferred_rebalance_ctx);
1961 rec->deferred_rebalance_ctx = NULL;
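/*
  handler for node rebalance requests: force an lcp2 rebalance for the
  given node and defer the resulting takeover run by the
  deferred_rebalance_on_node_add tunable
 */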
1965 static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvid,
1966 TDB_DATA data, void *private_data)
1969 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1971 if (data.dsize != sizeof(uint32_t)) {
1972 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1976 if (ctdb->tunable.deferred_rebalance_on_node_add == 0) {
1980 pnn = *(uint32_t *)&data.dptr[0];
1982 lcp2_forcerebalance(ctdb, pnn);
1983 DEBUG(DEBUG_NOTICE,("Received message to perform node rebalancing for node %d\n", pnn));
1985 if (rec->deferred_rebalance_ctx != NULL) {
1986 talloc_free(rec->deferred_rebalance_ctx);
1988 rec->deferred_rebalance_ctx = talloc_new(rec);
1989 event_add_timed(ctdb->ev, rec->deferred_rebalance_ctx,
1990 timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
1991 ctdb_rebalance_timeout, rec);
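/*
  handler for public ip assignment updates - only the recovery master
  records them in its ip assignment tree
 */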
1996 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
1997 TDB_DATA data, void *private_data)
1999 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2000 struct ctdb_public_ip *ip;
2002 if (rec->recmaster != rec->ctdb->pnn) {
2003 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2007 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2008 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2012 ip = (struct ctdb_public_ip *)data.dptr;
2014 update_ip_assignment_tree(rec->ctdb, ip);
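/*
  handler asking the recovery daemon to suspend the ip allocation checks
  for a number of seconds
 */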
2018 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2019 TDB_DATA data, void *private_data)
2021 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2024 if (rec->ip_check_disable_ctx != NULL) {
2025 talloc_free(rec->ip_check_disable_ctx);
2026 rec->ip_check_disable_ctx = NULL;
2029 if (data.dsize != sizeof(uint32_t)) {
2030 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2031 "expexting %lu\n", (long unsigned)data.dsize,
2032 (long unsigned)sizeof(uint32_t)));
2035 if (data.dptr == NULL) {
2036 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
2040 timeout = *((uint32_t *)data.dptr);
2041 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
2043 rec->ip_check_disable_ctx = talloc_new(rec);
2044 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
2046 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
2051 handler for ip reallocate, just add it to the list of callers and
2052 handle this later in the monitor_cluster loop so we do not recurse
2053 with other callers to takeover_run()
2055 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2056 TDB_DATA data, void *private_data)
2058 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2059 struct ip_reallocate_list *caller;
2061 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2062 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2066 if (rec->ip_reallocate_ctx == NULL) {
2067 rec->ip_reallocate_ctx = talloc_new(rec);
2068 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2071 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2072 CTDB_NO_MEMORY_FATAL(ctdb, caller);
2074 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2075 caller->next = rec->reallocate_callers;
2076 rec->reallocate_callers = caller;
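/*
  service all queued "ctdb ipreallocate" requests with a single takeover
  run and send the result back to every caller that asked for a reply
 */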
2081 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2083 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2086 struct ip_reallocate_list *callers;
2089 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2091 /* update the list of public ips that a node can handle, for all connected nodes */
2094 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2096 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2098 rec->need_takeover_run = true;
2101 ret = ctdb_takeover_run(ctdb, rec->nodemap);
2103 DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
2104 rec->need_takeover_run = true;
2108 result.dsize = sizeof(int32_t);
2109 result.dptr = (uint8_t *)&ret;
2111 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2113 /* Someone that sent srvid==0 does not want a reply */
2114 if (callers->rd->srvid == 0) {
2117 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2118 "%u:%llu\n", (unsigned)callers->rd->pnn,
2119 (unsigned long long)callers->rd->srvid));
2120 ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2122 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2123 "message to %u:%llu\n",
2124 (unsigned)callers->rd->pnn,
2125 (unsigned long long)callers->rd->srvid));
2129 talloc_free(tmp_ctx);
2130 talloc_free(rec->ip_reallocate_ctx);
2131 rec->ip_reallocate_ctx = NULL;
2132 rec->reallocate_callers = NULL;
2138 handler for recovery master elections
2140 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2141 TDB_DATA data, void *private_data)
2143 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2145 struct election_message *em = (struct election_message *)data.dptr;
2146 TALLOC_CTX *mem_ctx;
2148 /* we got an election packet - update the timeout for the election */
2149 talloc_free(rec->election_timeout);
2150 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2152 timeval_current_ofs(0, 500000) :
2153 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2154 ctdb_election_timeout, rec);
2156 mem_ctx = talloc_new(ctdb);
2158 /* someone called an election. check their election data
2159 and if we disagree and we would rather be the elected node,
2160 send a new election message to all other nodes
2162 if (ctdb_election_win(rec, em)) {
2163 if (!rec->send_election_te) {
2164 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2165 timeval_current_ofs(0, 500000),
2166 election_send_request, rec);
2168 talloc_free(mem_ctx);
2169 /*unban_all_nodes(ctdb);*/
2174 talloc_free(rec->send_election_te);
2175 rec->send_election_te = NULL;
2177 if (ctdb->tunable.verify_recovery_lock != 0) {
2178 /* release the recmaster lock */
2179 if (em->pnn != ctdb->pnn &&
2180 ctdb->recovery_lock_fd != -1) {
2181 close(ctdb->recovery_lock_fd);
2182 ctdb->recovery_lock_fd = -1;
2183 unban_all_nodes(ctdb);
2187 /* ok, let that guy become recmaster then */
2188 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2190 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2191 talloc_free(mem_ctx);
2195 talloc_free(mem_ctx);
2201 force the start of the election process
2203 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2204 struct ctdb_node_map *nodemap)
2207 struct ctdb_context *ctdb = rec->ctdb;
2209 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2211 /* set all nodes to recovery mode to stop all internode traffic */
2212 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2214 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2218 talloc_free(rec->election_timeout);
2219 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2221 timeval_current_ofs(0, 500000) :
2222 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2223 ctdb_election_timeout, rec);
2225 ret = send_election_request(rec, pnn, true);
2227 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2231 /* wait for a few seconds to collect all responses */
2232 ctdb_wait_election(rec);
2238 handler for when a node changes its flags
2240 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2241 TDB_DATA data, void *private_data)
2244 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2245 struct ctdb_node_map *nodemap=NULL;
2246 TALLOC_CTX *tmp_ctx;
2247 uint32_t changed_flags;
2249 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2250 int disabled_flag_changed;
2252 if (data.dsize != sizeof(*c)) {
2253 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2257 tmp_ctx = talloc_new(ctdb);
2258 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2260 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2262 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2263 talloc_free(tmp_ctx);
2268 for (i=0;i<nodemap->num;i++) {
2269 if (nodemap->nodes[i].pnn == c->pnn) break;
2272 if (i == nodemap->num) {
2273 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existent node %u\n", c->pnn));
2274 talloc_free(tmp_ctx);
2278 changed_flags = c->old_flags ^ c->new_flags;
2280 if (nodemap->nodes[i].flags != c->new_flags) {
2281 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2284 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2286 nodemap->nodes[i].flags = c->new_flags;
2288 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2289 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2292 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2293 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2297 ctdb->recovery_master == ctdb->pnn &&
2298 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2299 /* Only do the takeover run if the perm disabled or unhealthy
2300 flags changed, since these will cause an ip failover but not a recovery.
2302 If the node became disconnected or banned this will also
2303 lead to an ip address failover, but that is handled during recovery. */
2306 if (disabled_flag_changed) {
2307 rec->need_takeover_run = true;
2311 talloc_free(tmp_ctx);
2315 handler for when we need to push out flag changes to all other nodes
2317 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2318 TDB_DATA data, void *private_data)
2321 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2322 struct ctdb_node_map *nodemap=NULL;
2323 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2327 /* find the recovery master */
2328 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2330 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2331 talloc_free(tmp_ctx);
2335 /* read the node flags from the recmaster */
2336 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2338 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2339 talloc_free(tmp_ctx);
2342 if (c->pnn >= nodemap->num) {
2343 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2344 talloc_free(tmp_ctx);
2348 /* send the flags update to all connected nodes */
2349 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2351 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2352 nodes, 0, CONTROL_TIMEOUT(),
2356 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2358 talloc_free(tmp_ctx);
2362 talloc_free(tmp_ctx);
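/*
  Helper state and callback for verify_recmode() below: an async
  CTDB_CONTROL_GET_RECMODE request is sent to every active node and the
  callback counts the replies. If any control fails the overall status
  becomes MONITOR_FAILED; if any node reports that it is still in
  recovery mode the status becomes MONITOR_RECOVERY_NEEDED.
*/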
2366 struct verify_recmode_normal_data {
2368 enum monitor_result status;
2371 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2373 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2376 /* one more node has responded with recmode data*/
2379 /* if we failed to get the recmode, then return an error and let
2380 the main loop try again.
2382 if (state->state != CTDB_CONTROL_DONE) {
2383 if (rmdata->status == MONITOR_OK) {
2384 rmdata->status = MONITOR_FAILED;
2389 /* if we got a response, then the recmode will be stored in the status field */
2392 if (state->status != CTDB_RECOVERY_NORMAL) {
2393 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2394 rmdata->status = MONITOR_RECOVERY_NEEDED;
2401 /* verify that all nodes are in normal recovery mode */
2402 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2404 struct verify_recmode_normal_data *rmdata;
2405 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2406 struct ctdb_client_control_state *state;
2407 enum monitor_result status;
2410 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2411 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2413 rmdata->status = MONITOR_OK;
2415 /* loop over all active nodes and send an async getrecmode call to each of them */
2417 for (j=0; j<nodemap->num; j++) {
2418 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2421 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2423 nodemap->nodes[j].pnn);
2424 if (state == NULL) {
2425 /* we failed to send the control, treat this as
2426 an error and try again next iteration
2428 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2429 talloc_free(mem_ctx);
2430 return MONITOR_FAILED;
2433 /* set up the callback functions */
2434 state->async.fn = verify_recmode_normal_callback;
2435 state->async.private_data = rmdata;
2437 /* one more control to wait for to complete */
2442 /* now wait for up to the maximum number of seconds allowed
2443 or until all nodes we expect a response from have replied */
2445 while (rmdata->count > 0) {
2446 event_loop_once(ctdb->ev);
2449 status = rmdata->status;
2450 talloc_free(mem_ctx);
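/*
  Helper state and callback for verify_recmaster() below: an async
  CTDB_CONTROL_GET_RECMASTER request is sent to every active node.
  Any node that does not report our pnn as the recovery master is
  recorded as a culprit and the status is set to MONITOR_ELECTION_NEEDED
  so that the main loop forces a new election.
*/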
2455 struct verify_recmaster_data {
2456 struct ctdb_recoverd *rec;
2459 enum monitor_result status;
2462 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2464 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2467 /* one more node has responded with recmaster data*/
2470 /* if we failed to get the recmaster, then return an error and let
2471 the main loop try again.
2473 if (state->state != CTDB_CONTROL_DONE) {
2474 if (rmdata->status == MONITOR_OK) {
2475 rmdata->status = MONITOR_FAILED;
2480 /* if we got a response, then the recmaster will be stored in the status field */
2483 if (state->status != rmdata->pnn) {
2484 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2485 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2486 rmdata->status = MONITOR_ELECTION_NEEDED;
2493 /* verify that all nodes agree that we are the recmaster */
2494 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2496 struct ctdb_context *ctdb = rec->ctdb;
2497 struct verify_recmaster_data *rmdata;
2498 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2499 struct ctdb_client_control_state *state;
2500 enum monitor_result status;
2503 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2504 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2508 rmdata->status = MONITOR_OK;
2510 /* loop over all active nodes and send an async getrecmaster call to each of them */
2512 for (j=0; j<nodemap->num; j++) {
2513 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2516 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2518 nodemap->nodes[j].pnn);
2519 if (state == NULL) {
2520 /* we failed to send the control, treat this as
2521 an error and try again next iteration
2523 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2524 talloc_free(mem_ctx);
2525 return MONITOR_FAILED;
2528 /* set up the callback functions */
2529 state->async.fn = verify_recmaster_callback;
2530 state->async.private_data = rmdata;
2532 /* one more control to wait for to complete */
2537 /* now wait for up to the maximum number of seconds allowed
2538 or until all nodes we expect a response from have replied */
2540 while (rmdata->count > 0) {
2541 event_loop_once(ctdb->ev);
2544 status = rmdata->status;
2545 talloc_free(mem_ctx);
2550 /* called to check that the local allocation of public ip addresses is ok.
2552 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
2554 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2555 struct ctdb_control_get_ifaces *ifaces = NULL;
2556 struct ctdb_all_public_ips *ips = NULL;
2557 struct ctdb_uptime *uptime1 = NULL;
2558 struct ctdb_uptime *uptime2 = NULL;
2560 bool need_iface_check = false;
2561 bool need_takeover_run = false;
2563 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2564 CTDB_CURRENT_NODE, &uptime1);
2566 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2567 talloc_free(mem_ctx);
2572 /* read the interfaces from the local node */
2573 ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
2575 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
2576 talloc_free(mem_ctx);
2580 if (rec->ifaces == NULL) {
2581 need_iface_check = true;
2582 } else if (rec->ifaces->num != ifaces->num) {
2583 need_iface_check = true;
2584 } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
2585 need_iface_check = true;
2588 if (need_iface_check) {
2589 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2590 "local node %u - force takeover run\n",
2592 need_takeover_run = true;
2595 /* read the ip allocation from the local node */
2596 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2598 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2599 talloc_free(mem_ctx);
2603 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2604 CTDB_CURRENT_NODE, &uptime2);
2606 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2607 talloc_free(mem_ctx);
2611 /* skip the check if the startrecovery time has changed */
2612 if (timeval_compare(&uptime1->last_recovery_started,
2613 &uptime2->last_recovery_started) != 0) {
2614 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2615 talloc_free(mem_ctx);
2619 /* skip the check if the endrecovery time has changed */
2620 if (timeval_compare(&uptime1->last_recovery_finished,
2621 &uptime2->last_recovery_finished) != 0) {
2622 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2623 talloc_free(mem_ctx);
2627 /* skip the check if we have started but not finished recovery */
2628 if (timeval_compare(&uptime1->last_recovery_finished,
2629 &uptime1->last_recovery_started) != 1) {
2630 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2631 talloc_free(mem_ctx);
2636 talloc_free(rec->ifaces);
2637 rec->ifaces = talloc_steal(rec, ifaces);
2639 /* verify that we have the ip addresses we should have
2640 and that we don't have ones we shouldn't have.
2641 if we find an inconsistency we set recmode to
2642 active on the local node and wait for the recmaster
2643 to do a full blown recovery.
2644 also if the pnn is -1 and we are healthy and can host the ip
2645 we request an ip reallocation. */
2647 if (ctdb->tunable.disable_ip_failover == 0) {
2648 for (j=0; j<ips->num; j++) {
2649 if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
2650 DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
2651 ctdb_addr_to_str(&ips->ips[j].addr)));
2652 need_takeover_run = true;
2653 } else if (ips->ips[j].pnn == pnn) {
2654 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2655 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2656 ctdb_addr_to_str(&ips->ips[j].addr)));
2657 need_takeover_run = true;
2660 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2661 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2662 ctdb_addr_to_str(&ips->ips[j].addr)));
2663 need_takeover_run = true;
2669 if (need_takeover_run) {
2670 struct takeover_run_reply rd;
2673 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
2677 data.dptr = (uint8_t *)&rd;
2678 data.dsize = sizeof(rd);
2680 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2682 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2685 talloc_free(mem_ctx);
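/*
  Fetch the nodemap from each active node in parallel. The async
  callback stores every reply in the remote_nodemaps array indexed by
  pnn, so that main_loop() can compare the remote views of the cluster
  against our own nodemap.
*/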
2690 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2692 struct ctdb_node_map **remote_nodemaps = callback_data;
2694 if (node_pnn >= ctdb->num_nodes) {
2695 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2699 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
2703 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2704 struct ctdb_node_map *nodemap,
2705 struct ctdb_node_map **remote_nodemaps)
2709 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2710 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2712 CONTROL_TIMEOUT(), false, tdb_null,
2713 async_getnodemap_callback,
2715 remote_nodemaps) != 0) {
2716 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
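/*
  State used while checking that the recovery lock file is still
  accessible. The check runs in a child process which reports back
  through a pipe; status moves from RECLOCK_CHECKING to RECLOCK_OK,
  RECLOCK_FAILED or RECLOCK_TIMEOUT depending on what the child reports
  and whether it answers before the timeout fires.
*/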
2724 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
2725 struct ctdb_check_reclock_state {
2726 struct ctdb_context *ctdb;
2727 struct timeval start_time;
2730 struct timed_event *te;
2731 struct fd_event *fde;
2732 enum reclock_child_status status;
2735 /* when we free the reclock state we must kill any child process.
2737 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2739 struct ctdb_context *ctdb = state->ctdb;
2741 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2743 if (state->fd[0] != -1) {
2744 close(state->fd[0]);
2747 if (state->fd[1] != -1) {
2748 close(state->fd[1]);
2751 kill(state->child, SIGKILL);
2756 called if our check_reclock child times out. this would happen if
2757 i/o to the reclock file blocks.
2759 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2760 struct timeval t, void *private_data)
2762 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2763 struct ctdb_check_reclock_state);
2765 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timed out - CFS slow to grant locks?\n"));
2766 state->status = RECLOCK_TIMEOUT;
2769 /* this is called when the child process has completed checking the reclock
2770 file and has written data back to us through the pipe.
2772 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2773 uint16_t flags, void *private_data)
2775 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2776 struct ctdb_check_reclock_state);
2780 /* we got a response from our child process so we can abort the timeout */
2783 talloc_free(state->te);
2786 ret = read(state->fd[0], &c, 1);
2787 if (ret != 1 || c != RECLOCK_OK) {
2788 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2789 state->status = RECLOCK_FAILED;
2794 state->status = RECLOCK_OK;
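/*
  Verify that the recovery lock file we are holding is still usable.
  We fork a child that does a single pread() on the already open
  recovery_lock_fd and writes one status byte back through a pipe. The
  parent waits for that byte with a 15 second timed event so that a
  hung cluster filesystem shows up as RECLOCK_TIMEOUT instead of
  blocking the recovery daemon indefinitely.
*/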
2798 static int check_recovery_lock(struct ctdb_context *ctdb)
2801 struct ctdb_check_reclock_state *state;
2802 pid_t parent = getpid();
2804 if (ctdb->recovery_lock_fd == -1) {
2805 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2809 state = talloc(ctdb, struct ctdb_check_reclock_state);
2810 CTDB_NO_MEMORY(ctdb, state);
2813 state->start_time = timeval_current();
2814 state->status = RECLOCK_CHECKING;
2818 ret = pipe(state->fd);
2821 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2825 state->child = ctdb_fork(ctdb);
2826 if (state->child == (pid_t)-1) {
2827 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2828 close(state->fd[0]);
2830 close(state->fd[1]);
2836 if (state->child == 0) {
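/* child: verify we can still read from the recovery lock file and
   report the result back to the parent through the pipe */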
2837 char cc = RECLOCK_OK;
2838 close(state->fd[0]);
2841 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
2842 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2843 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2844 cc = RECLOCK_FAILED;
2847 write(state->fd[1], &cc, 1);
2848 /* make sure we die when our parent dies */
2849 while (kill(parent, 0) == 0 || errno != ESRCH) {
2851 write(state->fd[1], &cc, 1);
2855 close(state->fd[1]);
2857 set_close_on_exec(state->fd[0]);
2859 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
2861 talloc_set_destructor(state, check_reclock_destructor);
2863 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2864 ctdb_check_reclock_timeout, state);
2865 if (state->te == NULL) {
2866 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2871 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2873 reclock_child_handler,
2876 if (state->fde == NULL) {
2877 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
2881 tevent_fd_set_auto_close(state->fde);
2883 while (state->status == RECLOCK_CHECKING) {
2884 event_loop_once(ctdb->ev);
2887 if (state->status == RECLOCK_FAILED) {
2888 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2889 close(ctdb->recovery_lock_fd);
2890 ctdb->recovery_lock_fd = -1;
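/*
  Ask the main ctdb daemon for the currently configured recovery lock
  file and bring our cached copy in line with it: drop the path and
  close the fd if the reclock has been disabled, and adopt the new path
  (closing any old fd) if it has been set or changed.
*/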
2899 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2901 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2902 const char *reclockfile;
2904 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2905 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2906 talloc_free(tmp_ctx);
2910 if (reclockfile == NULL) {
2911 if (ctdb->recovery_lock_file != NULL) {
2912 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2913 talloc_free(ctdb->recovery_lock_file);
2914 ctdb->recovery_lock_file = NULL;
2915 if (ctdb->recovery_lock_fd != -1) {
2916 close(ctdb->recovery_lock_fd);
2917 ctdb->recovery_lock_fd = -1;
2920 ctdb->tunable.verify_recovery_lock = 0;
2921 talloc_free(tmp_ctx);
2925 if (ctdb->recovery_lock_file == NULL) {
2926 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2927 if (ctdb->recovery_lock_fd != -1) {
2928 close(ctdb->recovery_lock_fd);
2929 ctdb->recovery_lock_fd = -1;
2931 talloc_free(tmp_ctx);
2936 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2937 talloc_free(tmp_ctx);
2941 talloc_free(ctdb->recovery_lock_file);
2942 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2943 ctdb->tunable.verify_recovery_lock = 0;
2944 if (ctdb->recovery_lock_fd != -1) {
2945 close(ctdb->recovery_lock_fd);
2946 ctdb->recovery_lock_fd = -1;
2949 talloc_free(tmp_ctx);
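/*
  One pass of the recovery daemon's monitoring work: check that ctdbd
  is still alive, refresh tunables and the recovery lock settings, ban
  nodes that keep causing recoveries, make sure a recovery master
  exists and that every active node agrees on it, and compare nodemaps,
  flags and vnnmaps across the cluster. Any inconsistency triggers an
  election, a recovery or an ip takeover run as appropriate.
*/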
2953 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2954 TALLOC_CTX *mem_ctx)
2957 struct ctdb_node_map *nodemap=NULL;
2958 struct ctdb_node_map *recmaster_nodemap=NULL;
2959 struct ctdb_node_map **remote_nodemaps=NULL;
2960 struct ctdb_vnn_map *vnnmap=NULL;
2961 struct ctdb_vnn_map *remote_vnnmap=NULL;
2962 int32_t debug_level;
2967 /* verify that the main daemon is still running */
2968 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2969 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2973 /* ping the local daemon to tell it we are alive */
2974 ctdb_ctrl_recd_ping(ctdb);
2976 if (rec->election_timeout) {
2977 /* an election is in progress */
2981 /* read the debug level from the parent and update locally */
2982 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2984 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2987 LogLevel = debug_level;
2990 /* We must check if we need to ban a node here but we want to do this
2991 as early as possible so we don't wait until we have pulled the node
2992 map from the local node. that's why we use the hardcoded value 20
2994 for (i=0; i<ctdb->num_nodes; i++) {
2995 struct ctdb_banning_state *ban_state;
2997 if (ctdb->nodes[i]->ban_state == NULL) {
3000 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
3001 if (ban_state->count < 20) {
3004 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
3005 ctdb->nodes[i]->pnn, ban_state->count,
3006 ctdb->tunable.recovery_ban_period));
3007 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
3008 ban_state->count = 0;
3011 /* get relevant tunables */
3012 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3014 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3018 /* get the current recovery lock file from the server */
3019 if (update_recovery_lock_file(ctdb) != 0) {
3020 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3024 /* Make sure that if recovery lock verification becomes disabled, we close the recovery lock file */
3027 if (ctdb->tunable.verify_recovery_lock == 0) {
3028 if (ctdb->recovery_lock_fd != -1) {
3029 close(ctdb->recovery_lock_fd);
3030 ctdb->recovery_lock_fd = -1;
3034 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3035 if (pnn == (uint32_t)-1) {
3036 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
3040 /* get the vnnmap */
3041 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3043 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3048 /* get number of nodes */
3050 talloc_free(rec->nodemap);
3051 rec->nodemap = NULL;
3054 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3056 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3059 nodemap = rec->nodemap;
3061 /* check which node is the recovery master */
3062 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3064 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3068 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
3069 if (rec->recmaster != pnn) {
3070 if (rec->ip_reallocate_ctx != NULL) {
3071 talloc_free(rec->ip_reallocate_ctx);
3072 rec->ip_reallocate_ctx = NULL;
3073 rec->reallocate_callers = NULL;
3076 /* if there are takeovers requested, perform it and notify the waiters */
3077 if (rec->reallocate_callers) {
3078 process_ipreallocate_requests(ctdb, rec);
3081 if (rec->recmaster == (uint32_t)-1) {
3082 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3083 force_election(rec, pnn, nodemap);
3088 /* if the local daemon is STOPPED, we verify that the databases are
3089 also frozen and that the recmode is set to active
3091 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
3092 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3094 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3096 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3097 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
3099 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3101 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
3104 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3106 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
3113 /* If the local node is stopped, verify we are not the recmaster
3114 and, if we are, yield that role
3116 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
3117 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
3118 force_election(rec, pnn, nodemap);
3122 /* check that we (recovery daemon) and the local ctdb daemon
3123 agree on whether we are banned or not
3127 /* remember our own node flags */
3128 rec->node_flags = nodemap->nodes[pnn].flags;
3130 /* count how many active nodes there are */
3131 rec->num_active = 0;
3132 rec->num_connected = 0;
3133 for (i=0; i<nodemap->num; i++) {
3134 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3137 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3138 rec->num_connected++;
3143 /* verify that the recmaster node is still active */
3144 for (j=0; j<nodemap->num; j++) {
3145 if (nodemap->nodes[j].pnn==rec->recmaster) {
3150 if (j == nodemap->num) {
3151 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3152 force_election(rec, pnn, nodemap);
3156 /* if recovery master is disconnected we must elect a new recmaster */
3157 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3158 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3159 force_election(rec, pnn, nodemap);
3163 /* grab the nodemap from the recovery master to check if it is banned */
3164 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3165 mem_ctx, &recmaster_nodemap);
3167 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3168 nodemap->nodes[j].pnn));
3173 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3174 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3175 force_election(rec, pnn, nodemap);
3180 /* verify that we have all ip addresses we should have and we don't
3181 * have addresses we shouldn't have.
3183 if (ctdb->tunable.disable_ip_failover == 0) {
3184 if (rec->ip_check_disable_ctx == NULL) {
3185 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3186 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3192 /* if we are not the recmaster then we do not need to check
3193 if recovery is needed
3195 if (pnn != rec->recmaster) {
3200 /* ensure our local copies of flags are right */
3201 ret = update_local_flags(rec, nodemap);
3202 if (ret == MONITOR_ELECTION_NEEDED) {
3203 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3204 force_election(rec, pnn, nodemap);
3207 if (ret != MONITOR_OK) {
3208 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3212 if (ctdb->num_nodes != nodemap->num) {
3213 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3214 reload_nodes_file(ctdb);
3218 /* verify that all active nodes agree that we are the recmaster */
3219 switch (verify_recmaster(rec, nodemap, pnn)) {
3220 case MONITOR_RECOVERY_NEEDED:
3221 /* can not happen */
3223 case MONITOR_ELECTION_NEEDED:
3224 force_election(rec, pnn, nodemap);
3228 case MONITOR_FAILED:
3233 if (rec->need_recovery) {
3234 /* a previous recovery didn't finish */
3235 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3239 /* verify that all active nodes are in normal mode
3240 and not in recovery mode
3242 switch (verify_recmode(ctdb, nodemap)) {
3243 case MONITOR_RECOVERY_NEEDED:
3244 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3246 case MONITOR_FAILED:
3248 case MONITOR_ELECTION_NEEDED:
3249 /* can not happen */
3255 if (ctdb->tunable.verify_recovery_lock != 0) {
3256 /* we should have the reclock - check it's not stale */
3257 ret = check_recovery_lock(ctdb);
3259 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3260 ctdb_set_culprit(rec, ctdb->pnn);
3261 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3266 /* get the nodemap for all active remote nodes
3268 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3269 if (remote_nodemaps == NULL) {
3270 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3273 for(i=0; i<nodemap->num; i++) {
3274 remote_nodemaps[i] = NULL;
3276 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3277 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3281 /* verify that all other nodes have the same nodemap as we have
3283 for (j=0; j<nodemap->num; j++) {
3284 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3288 if (remote_nodemaps[j] == NULL) {
3289 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3290 ctdb_set_culprit(rec, j);
3295 /* if the nodes disagree on how many nodes there are
3296 then this is a good reason to try recovery
3298 if (remote_nodemaps[j]->num != nodemap->num) {
3299 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3300 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3301 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3302 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3306 /* if the nodes disagree on which nodes exist and are
3307 active, then that is also a good reason to do recovery
3309 for (i=0;i<nodemap->num;i++) {
3310 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3311 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3312 nodemap->nodes[j].pnn, i,
3313 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3314 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3315 do_recovery(rec, mem_ctx, pnn, nodemap,
3321 /* verify the flags are consistent
3323 for (i=0; i<nodemap->num; i++) {
3324 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3328 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3329 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3330 nodemap->nodes[j].pnn,
3331 nodemap->nodes[i].pnn,
3332 remote_nodemaps[j]->nodes[i].flags,
3333 nodemap->nodes[i].flags));
3335 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3336 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3337 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3338 do_recovery(rec, mem_ctx, pnn, nodemap,
3342 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3343 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3344 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3345 do_recovery(rec, mem_ctx, pnn, nodemap,
3354 /* there better be the same number of lmasters in the vnn map
3355 as there are active nodes or we will have to do a recovery
3357 if (vnnmap->size != rec->num_active) {
3358 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3359 vnnmap->size, rec->num_active));
3360 ctdb_set_culprit(rec, ctdb->pnn);
3361 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3365 /* verify that all active nodes in the nodemap also exist in
3368 for (j=0; j<nodemap->num; j++) {
3369 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3372 if (nodemap->nodes[j].pnn == pnn) {
3376 for (i=0; i<vnnmap->size; i++) {
3377 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3381 if (i == vnnmap->size) {
3382 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3383 nodemap->nodes[j].pnn));
3384 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3385 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3391 /* verify that all other nodes have the same vnnmap
3392 and are from the same generation
3394 for (j=0; j<nodemap->num; j++) {
3395 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3398 if (nodemap->nodes[j].pnn == pnn) {
3402 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3403 mem_ctx, &remote_vnnmap);
3405 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3406 nodemap->nodes[j].pnn));
3410 /* verify the vnnmap generation is the same */
3411 if (vnnmap->generation != remote_vnnmap->generation) {
3412 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3413 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3414 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3415 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3419 /* verify the vnnmap size is the same */
3420 if (vnnmap->size != remote_vnnmap->size) {
3421 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3422 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3423 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3424 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3428 /* verify the vnnmap is the same */
3429 for (i=0;i<vnnmap->size;i++) {
3430 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3431 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3432 nodemap->nodes[j].pnn));
3433 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3434 do_recovery(rec, mem_ctx, pnn, nodemap,
3441 /* we might need to change who has what IP assigned */
3442 if (rec->need_takeover_run) {
3443 uint32_t culprit = (uint32_t)-1;
3445 rec->need_takeover_run = false;
3447 /* update the list of public ips that a node can handle for all connected nodes */
3450 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3452 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3454 rec->need_takeover_run = true;
3458 /* execute the "startrecovery" event script on all nodes */
3459 ret = run_startrecovery_eventscript(rec, nodemap);
3461 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3462 ctdb_set_culprit(rec, ctdb->pnn);
3463 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3467 ret = ctdb_takeover_run(ctdb, nodemap);
3469 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
3473 /* execute the "recovered" event script on all nodes */
3474 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3476 // we can't check whether the event completed successfully
3477 // since this script WILL fail if the node is in recovery mode
3478 // and if that race happens, the code here would just cause a second
3479 // cascading recovery.
3481 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3482 ctdb_set_culprit(rec, ctdb->pnn);
3483 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3490 the main monitoring loop
3492 static void monitor_cluster(struct ctdb_context *ctdb)
3494 struct ctdb_recoverd *rec;
3496 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3498 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3499 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3503 rec->priority_time = timeval_current();
3505 /* register a message port for sending memory dumps */
3506 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3508 /* register a message port for recovery elections */
3509 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3511 /* when nodes are disabled/enabled */
3512 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3514 /* when we are asked to push out a flag change */
3515 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3517 /* register a message port for vacuum fetch */
3518 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3520 /* register a message port for reloadnodes */
3521 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3523 /* register a message port for performing a takeover run */
3524 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3526 /* register a message port for disabling the ip check for a short while */
3527 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3529 /* register a message port for updating the recovery daemons node assignment for an ip */
3530 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3532 /* register a message port for forcing a rebalance of a node at the next takeover run */
3534 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
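/* the main loop: run one monitoring pass, then sleep for whatever is
   left of the recovery interval before checking again */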
3537 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3538 struct timeval start;
3542 DEBUG(DEBUG_CRIT,(__location__
3543 " Failed to create temp context\n"));
3547 start = timeval_current();
3548 main_loop(ctdb, rec, mem_ctx);
3549 talloc_free(mem_ctx);
3551 /* we only check for recovery once every second */
3552 elapsed = timeval_elapsed(&start);
3553 if (elapsed < ctdb->tunable.recover_interval) {
3554 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval - elapsed);
3561 event handler for when the main ctdbd dies
3563 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3564 uint16_t flags, void *private_data)
3566 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3571 called regularly to verify that the recovery daemon is still running
3573 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3574 struct timeval yt, void *p)
3576 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3578 if (kill(ctdb->recoverd_pid, 0) != 0) {
3579 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3581 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
3582 ctdb_restart_recd, ctdb);
3587 event_add_timed(ctdb->ev, ctdb,
3588 timeval_current_ofs(30, 0),
3589 ctdb_check_recd, ctdb);
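/*
  SIGCHLD handler for the recovery daemon: reap any exited child
  processes (such as the reclock check children) so they do not linger
  as zombies.
*/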
3592 static void recd_sig_child_handler(struct event_context *ev,
3593 struct signal_event *se, int signum, int count,
3597 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3602 pid = waitpid(-1, &status, WNOHANG);
3604 if (errno != ECHILD) {
3605 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3610 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3616 startup the recovery daemon as a child of the main ctdb daemon
3618 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3621 struct signal_event *se;
3622 struct tevent_fd *fde;
3624 if (pipe(fd) != 0) {
3628 ctdb->ctdbd_pid = getpid();
3630 ctdb->recoverd_pid = fork();
3631 if (ctdb->recoverd_pid == -1) {
3635 if (ctdb->recoverd_pid != 0) {
3637 event_add_timed(ctdb->ev, ctdb,
3638 timeval_current_ofs(30, 0),
3639 ctdb_check_recd, ctdb);
3645 srandom(getpid() ^ time(NULL));
3647 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
3648 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3652 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3654 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
3655 ctdb_recoverd_parent, &fd[0]);
3656 tevent_fd_set_auto_close(fde);
3658 /* set up a handler to pick up sigchld */
3659 se = event_add_signal(ctdb->ev, ctdb,
3661 recd_sig_child_handler,
3664 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3668 monitor_cluster(ctdb);
3670 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3675 shutdown the recovery daemon
3677 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3679 if (ctdb->recoverd_pid == 0) {
3683 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3684 kill(ctdb->recoverd_pid, SIGTERM);
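/*
  Restart the recovery daemon: stop any existing instance and fork a
  new one. Scheduled as an immediate timed event by ctdb_check_recd()
  when it notices the recovery daemon has died.
*/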
3687 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
3688 struct timeval t, void *private_data)
3690 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3692 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3693 ctdb_stop_recoverd(ctdb);
3694 ctdb_start_recoverd(ctdb);