4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
30 #include "dlinklist.h"
33 /* most recent reload all ips request we need to perform during the
/* NOTE(review): extraction dropped lines here (fused numbering jumps
 * 33 -> 36) — the comment above is truncated. Presumably the pending
 * "reload all ips" reply context; NULL when none is outstanding. */
36 struct reloadips_all_reply *reload_all_ips_request = NULL;
38 /* list of "ctdb ipreallocate" processes to call back when we have
39 finished the takeover run.
/* Singly-linked list node; `rd` carries the caller's reply address. */
41 struct ip_reallocate_list {
42 struct ip_reallocate_list *next;
43 struct rd_memdump_reply *rd;
/* NOTE(review): incomplete extraction — the struct's closing brace
 * (and any further members) was dropped; re-sync with full source. */
/* Per-node misbehaviour tracking used to decide when to ban a node. */
46 struct ctdb_banning_state {
/* NOTE(review): a member (used as ban_state->count elsewhere in this
 * file) and the closing brace were dropped by the extraction. */
48 struct timeval last_reported_time;
52 private state of recovery daemon
54 struct ctdb_recoverd {
55 struct ctdb_context *ctdb;
/* NOTE(review): extraction dropped members between lines 55 and 58
 * (fused numbering jumps) — do not treat this layout as complete. */
58 uint32_t num_connected;
59 uint32_t last_culprit_node;
60 struct ctdb_node_map *nodemap;
61 struct timeval priority_time;
62 bool need_takeover_run;
65 struct timed_event *send_election_te;
66 struct timed_event *election_timeout;
67 struct vacuum_info *vacuum_info;
68 TALLOC_CTX *ip_reallocate_ctx;
69 struct ip_reallocate_list *reallocate_callers;
70 TALLOC_CTX *ip_check_disable_ctx;
71 struct ctdb_control_get_ifaces *ifaces;
72 TALLOC_CTX *deferred_rebalance_ctx;
/* Control/monitor timeouts. Both macros expand in scopes where a local
 * `ctdb` pointer is visible; the durations come from cluster tunables. */
75 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
76 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* forward declaration: timed-event handler that restarts the recovery daemon */
78 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
81 ban a node for a period of time
83 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
86 struct ctdb_context *ctdb = rec->ctdb;
87 struct ctdb_ban_time bantime;
89 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
91 if (!ctdb_validate_pnn(ctdb, pnn)) {
92 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
97 bantime.time = ban_time;
99 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
101 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* outcome of one monitoring pass over the cluster */
107 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
111 run the "recovered" eventscript on all nodes
113 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
118 tmp_ctx = talloc_new(ctdb);
119 CTDB_NO_MEMORY(ctdb, tmp_ctx);
121 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
122 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
124 CONTROL_TIMEOUT(), false, tdb_null,
127 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
129 talloc_free(tmp_ctx);
133 talloc_free(tmp_ctx);
138 remember the trouble maker
140 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
142 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
143 struct ctdb_banning_state *ban_state;
145 if (culprit > ctdb->num_nodes) {
146 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
150 if (ctdb->nodes[culprit]->ban_state == NULL) {
151 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
152 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
156 ban_state = ctdb->nodes[culprit]->ban_state;
157 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
158 /* this was the first time in a long while this node
159 misbehaved so we will forgive any old transgressions.
161 ban_state->count = 0;
164 ban_state->count += count;
165 ban_state->last_reported_time = timeval_current();
166 rec->last_culprit_node = culprit;
/*
  Remember a trouble maker: convenience wrapper that charges a single
  credit against the culprit node.
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
178 /* this callback is called for every node that failed to execute the
181 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
183 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
185 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
187 ctdb_set_culprit(rec, node_pnn);
191 run the "startrecovery" eventscript on all nodes
193 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
197 struct ctdb_context *ctdb = rec->ctdb;
199 tmp_ctx = talloc_new(ctdb);
200 CTDB_NO_MEMORY(ctdb, tmp_ctx);
202 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
203 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
205 CONTROL_TIMEOUT(), false, tdb_null,
207 startrecovery_fail_callback,
209 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
210 talloc_free(tmp_ctx);
214 talloc_free(tmp_ctx);
218 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
220 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
221 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
224 if (node_pnn < ctdb->num_nodes) {
225 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
228 if (node_pnn == ctdb->pnn) {
229 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
234 update the node capabilities for all connected nodes
236 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
241 tmp_ctx = talloc_new(ctdb);
242 CTDB_NO_MEMORY(ctdb, tmp_ctx);
244 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
245 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
249 async_getcap_callback, NULL,
251 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
252 talloc_free(tmp_ctx);
256 talloc_free(tmp_ctx);
260 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
262 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
264 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
265 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
268 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
270 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
272 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
273 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
277 change recovery mode on all nodes
/* NOTE(review): incomplete extraction — interior lines are missing
 * (fused original numbering jumps, e.g. 294 -> 299 inside the FREEZE
 * call and 312 -> 318 around SET_RECMODE).  When entering ACTIVE mode
 * the visible code freezes each database priority band on all active
 * nodes before broadcasting the new recovery mode.  Re-sync with the
 * full source before editing. */
279 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
285 tmp_ctx = talloc_new(ctdb);
286 CTDB_NO_MEMORY(ctdb, tmp_ctx);
288 /* freeze all nodes */
289 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
290 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
293 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
294 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
299 set_recmode_fail_callback,
301 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
302 talloc_free(tmp_ctx);
309 data.dsize = sizeof(uint32_t);
310 data.dptr = (unsigned char *)&rec_mode;
312 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
318 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
319 talloc_free(tmp_ctx);
323 talloc_free(tmp_ctx);
328 change recovery master on all node
330 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
336 tmp_ctx = talloc_new(ctdb);
337 CTDB_NO_MEMORY(ctdb, tmp_ctx);
339 data.dsize = sizeof(uint32_t);
340 data.dptr = (unsigned char *)&pnn;
342 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
343 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
345 CONTROL_TIMEOUT(), false, data,
348 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
349 talloc_free(tmp_ctx);
353 talloc_free(tmp_ctx);
357 /* update all remote nodes to use the same db priority that we have
358 this can fail if the remove node has not yet been upgraded to
359 support this function, so we always return success and never fail
360 a recovery if this call fails.
/* NOTE(review): incomplete extraction — declarations, error-branch
 * bodies and the success return are missing (fused numbering jumps,
 * e.g. 390 -> 392 inside the async control call).  Per the comment
 * above, failures here are logged but never fail a recovery. */
362 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
363 struct ctdb_node_map *nodemap,
364 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
369 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
371 /* step through all local databases */
372 for (db=0; db<dbmap->num;db++) {
374 struct ctdb_db_priority db_prio;
377 db_prio.db_id = dbmap->dbs[db].dbid;
378 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
380 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
384 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
386 data.dptr = (uint8_t *)&db_prio;
387 data.dsize = sizeof(db_prio);
389 if (ctdb_client_async_control(ctdb,
390 CTDB_CONTROL_SET_DB_PRIORITY,
392 CONTROL_TIMEOUT(), false, data,
395 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
403 ensure all other nodes have attached to any databases that we have
/* NOTE(review): incomplete extraction — loop-control lines, `continue`
 * statements and error returns are missing (fused numbering jumps).
 * Visible flow: for every other active node, fetch its dbmap; for each
 * local database the remote node lacks, look up its name locally and
 * issue a createdb on the remote node. */
405 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
406 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
409 struct ctdb_dbid_map *remote_dbmap;
411 /* verify that all other nodes have all our databases */
412 for (j=0; j<nodemap->num; j++) {
413 /* we dont need to ourself ourselves */
414 if (nodemap->nodes[j].pnn == pnn) {
417 /* dont check nodes that are unavailable */
418 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
422 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
423 mem_ctx, &remote_dbmap);
425 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
429 /* step through all local databases */
430 for (db=0; db<dbmap->num;db++) {
434 for (i=0;i<remote_dbmap->num;i++) {
435 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
439 /* the remote node already have this database */
440 if (i!=remote_dbmap->num) {
443 /* ok so we need to create this database */
444 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
447 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
450 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
452 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
454 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
465 ensure we are attached to any databases that anyone else is attached to
/* NOTE(review): incomplete extraction — `continue`s, error returns and
 * the trailing success path are missing (fused numbering jumps).
 * Visible flow: mirror image of create_missing_remote_databases — for
 * each remote database we lack locally, fetch its name from the remote
 * node, create it locally, then re-read our own dbmap so *dbmap stays
 * current for the caller. */
467 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
468 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
471 struct ctdb_dbid_map *remote_dbmap;
473 /* verify that we have all database any other node has */
474 for (j=0; j<nodemap->num; j++) {
475 /* we dont need to ourself ourselves */
476 if (nodemap->nodes[j].pnn == pnn) {
479 /* dont check nodes that are unavailable */
480 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
484 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
485 mem_ctx, &remote_dbmap);
487 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
491 /* step through all databases on the remote node */
492 for (db=0; db<remote_dbmap->num;db++) {
495 for (i=0;i<(*dbmap)->num;i++) {
496 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
500 /* we already have this db locally */
501 if (i!=(*dbmap)->num) {
504 /* ok so we need to create this database and
507 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
508 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
511 nodemap->nodes[j].pnn));
514 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
515 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
517 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
520 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
522 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
533 pull the remote database contents from one node into the recdb
/* NOTE(review): incomplete extraction — the record-loop header, the
 * RSN-comparison `continue` branch and all error returns are missing
 * (fused numbering jumps, e.g. 561 -> 565).  Visible flow: PULL_DB the
 * whole database from srcnode, walk the marshalled records, and store
 * each into the local recovery tdb, keeping whichever copy of a record
 * wins the RSN/dmaster comparison against what is already in recdb. */
535 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
536 struct tdb_wrap *recdb, uint32_t dbid)
540 struct ctdb_marshall_buffer *reply;
541 struct ctdb_rec_data *rec;
543 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
545 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
546 CONTROL_TIMEOUT(), &outdata);
548 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
549 talloc_free(tmp_ctx);
553 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
555 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
556 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
557 talloc_free(tmp_ctx);
561 rec = (struct ctdb_rec_data *)&reply->data[0];
565 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
567 struct ctdb_ltdb_header *hdr;
570 key.dptr = &rec->data[0];
571 key.dsize = rec->keylen;
572 data.dptr = &rec->data[key.dsize];
573 data.dsize = rec->datalen;
575 hdr = (struct ctdb_ltdb_header *)data.dptr;
577 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
578 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
579 talloc_free(tmp_ctx);
583 /* fetch the existing record, if any */
584 existing = tdb_fetch(recdb->tdb, key);
586 if (existing.dptr != NULL) {
587 struct ctdb_ltdb_header header;
588 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
589 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
590 (unsigned)existing.dsize, srcnode));
592 talloc_free(tmp_ctx);
595 header = *(struct ctdb_ltdb_header *)existing.dptr;
597 if (!(header.rsn < hdr->rsn ||
598 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
603 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
604 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
605 talloc_free(tmp_ctx);
610 talloc_free(tmp_ctx);
/*
  Shared state for the GET_DB_SEQNUM async scan: tracks whether any
  reply failed, and the node (pnn) holding the highest sequence number
  seen so far.  Members reconstructed from their uses in
  pull_seqnum_cb/pull_highest_seqnum_pdb (the extraction dropped them):
  `failed` is compared against 0 and set to 1/-1, `pnn` is compared
  against -1, `seqnum` holds a uint64_t database sequence number.
 */
struct pull_seqnum_cbdata {
	int failed;
	int pnn;
	uint64_t seqnum;
};
622 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
624 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
627 if (cb_data->failed != 0) {
628 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
633 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
638 if (outdata.dsize != sizeof(uint64_t)) {
639 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
640 cb_data->failed = -1;
644 seqnum = *((uint64_t *)outdata.dptr);
646 if (seqnum > cb_data->seqnum) {
647 cb_data->seqnum = seqnum;
648 cb_data->pnn = node_pnn;
652 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
654 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
656 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
/* Scan all active nodes for the highest sequence number of persistent
 * database `dbid`, then pull that node's copy into the recovery tdb.
 * NOTE(review): incomplete extraction — the cb_data initialisation,
 * the outdata declaration and several returns are missing (fused
 * numbering jumps, e.g. 682 -> 690).  Re-sync before editing. */
660 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
661 struct ctdb_recoverd *rec,
662 struct ctdb_node_map *nodemap,
663 struct tdb_wrap *recdb, uint32_t dbid)
665 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
669 struct pull_seqnum_cbdata *cb_data;
671 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
676 data.dsize = sizeof(outdata);
677 data.dptr = (uint8_t *)&outdata[0];
679 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
680 if (cb_data == NULL) {
681 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
682 talloc_free(tmp_ctx);
690 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
691 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
693 CONTROL_TIMEOUT(), false, data,
697 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
699 talloc_free(tmp_ctx);
703 if (cb_data->failed != 0) {
704 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
705 talloc_free(tmp_ctx);
709 if (cb_data->seqnum == 0 || cb_data->pnn == -1) {
710 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
711 talloc_free(tmp_ctx);
715 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
717 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
718 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
719 talloc_free(tmp_ctx);
723 talloc_free(tmp_ctx);
729 pull all the remote database contents into the recdb
/* NOTE(review): incomplete extraction — the `persistent` parameter
 * line, returns and `continue` branches are missing (fused numbering
 * jumps).  Visible flow: persistent DBs with recover_pdb_by_seqnum
 * enabled take the highest-seqnum path; otherwise every active node's
 * copy is merged in by RSN, charging culprit credits to nodes whose
 * pull fails. */
731 static int pull_remote_database(struct ctdb_context *ctdb,
732 struct ctdb_recoverd *rec,
733 struct ctdb_node_map *nodemap,
734 struct tdb_wrap *recdb, uint32_t dbid,
739 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
741 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
747 /* pull all records from all other nodes across onto this node
748 (this merges based on rsn)
750 for (j=0; j<nodemap->num; j++) {
751 /* dont merge from nodes that are unavailable */
752 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
755 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
756 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
757 nodemap->nodes[j].pnn));
758 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
768 update flags on all active nodes
770 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
774 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
776 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
784 ensure all nodes have the same vnnmap we do
786 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
787 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
791 /* push the new vnn map out to all the nodes */
792 for (j=0; j<nodemap->num; j++) {
793 /* dont push to nodes that are unavailable */
794 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
798 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
800 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* NOTE(review): incomplete extraction — the `struct vacuum_info {`
 * header line, a srcnode member (used elsewhere as v->srcnode) and the
 * closing brace were dropped.  One entry per in-flight vacuum-fetch
 * stream, kept on rec->vacuum_info. */
810 struct vacuum_info *next, *prev;
811 struct ctdb_recoverd *rec;
813 struct ctdb_db_context *ctdb_db;
814 struct ctdb_marshall_buffer *recs;
815 struct ctdb_rec_data *r;
/* forward declaration: processes the next record in a vacuum stream */
818 static void vacuum_fetch_next(struct vacuum_info *v);
821 called when a vacuum fetch has completed - just free it and do the next one
823 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
825 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
827 vacuum_fetch_next(v);
832 process the next element from the vacuum list
/* NOTE(review): incomplete extraction — the loop's record-advance
 * bookkeeping, chainlock-failure handling, the async-send error path
 * and the function tail are missing (fused numbering jumps).  Visible
 * flow: for each queued record, take a non-blocking chainlock (skip the
 * record if unavailable), skip records that are already local, and
 * otherwise fire an async NULL_FUNC migration call whose completion
 * re-enters this function. */
834 static void vacuum_fetch_next(struct vacuum_info *v)
836 struct ctdb_call call;
837 struct ctdb_rec_data *r;
839 while (v->recs->count) {
840 struct ctdb_client_call_state *state;
842 struct ctdb_ltdb_header *hdr;
845 call.call_id = CTDB_NULL_FUNC;
846 call.flags = CTDB_IMMEDIATE_MIGRATION;
847 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
850 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
853 call.key.dptr = &r->data[0];
854 call.key.dsize = r->keylen;
856 /* ensure we don't block this daemon - just skip a record if we can't get
858 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
862 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
863 if (data.dptr == NULL) {
864 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
868 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
870 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
874 hdr = (struct ctdb_ltdb_header *)data.dptr;
875 if (hdr->dmaster == v->rec->ctdb->pnn) {
876 /* its already local */
878 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
884 state = ctdb_call_send(v->ctdb_db, &call);
885 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
887 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
891 state->async.fn = vacuum_fetch_callback;
892 state->async.private_data = v;
901 destroy a vacuum info structure
903 static int vacuum_info_destructor(struct vacuum_info *v)
905 DLIST_REMOVE(v->rec->vacuum_info, v);
911 handler for vacuum fetch
/* NOTE(review): incomplete extraction — the srcnode extraction from the
 * message, several loop/branch closers and returns are missing (fused
 * numbering jumps).  Visible flow: unmarshal the record buffer, ignore
 * empty or already-in-progress streams, work out whether the database
 * is persistent, attach to it, then queue a vacuum_info (with
 * destructor for list removal) and start processing records. */
913 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
914 TDB_DATA data, void *private_data)
916 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
917 struct ctdb_marshall_buffer *recs;
919 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
921 struct ctdb_dbid_map *dbmap=NULL;
922 bool persistent = false;
923 struct ctdb_db_context *ctdb_db;
924 struct ctdb_rec_data *r;
926 struct vacuum_info *v;
928 recs = (struct ctdb_marshall_buffer *)data.dptr;
929 r = (struct ctdb_rec_data *)&recs->data[0];
931 if (recs->count == 0) {
932 talloc_free(tmp_ctx);
938 for (v=rec->vacuum_info;v;v=v->next) {
939 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
940 /* we're already working on records from this node */
941 talloc_free(tmp_ctx);
946 /* work out if the database is persistent */
947 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
949 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
950 talloc_free(tmp_ctx);
954 for (i=0;i<dbmap->num;i++) {
955 if (dbmap->dbs[i].dbid == recs->db_id) {
956 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
960 if (i == dbmap->num) {
961 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
962 talloc_free(tmp_ctx);
966 /* find the name of this database */
967 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
968 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
969 talloc_free(tmp_ctx);
974 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
975 if (ctdb_db == NULL) {
976 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
977 talloc_free(tmp_ctx);
981 v = talloc_zero(rec, struct vacuum_info);
983 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
984 talloc_free(tmp_ctx);
989 v->srcnode = srcnode;
990 v->ctdb_db = ctdb_db;
991 v->recs = talloc_memdup(v, recs, data.dsize);
992 if (v->recs == NULL) {
993 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
995 talloc_free(tmp_ctx);
998 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1000 DLIST_ADD(rec->vacuum_info, v);
1002 talloc_set_destructor(v, vacuum_info_destructor);
1004 vacuum_fetch_next(v);
1005 talloc_free(tmp_ctx);
1010 called when ctdb_wait_timeout should finish
1012 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1013 struct timeval yt, void *p)
1015 uint32_t *timed_out = (uint32_t *)p;
1020 wait for a given number of seconds
1022 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1024 uint32_t timed_out = 0;
1025 time_t usecs = (secs - (time_t)secs) * 1000000;
1026 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1027 while (!timed_out) {
1028 event_loop_once(ctdb->ev);
1033 called when an election times out (ends)
1035 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1036 struct timeval t, void *p)
1038 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1039 rec->election_timeout = NULL;
1042 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
1047 wait for an election to finish. It finished election_timeout seconds after
1048 the last election packet is received
1050 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1052 struct ctdb_context *ctdb = rec->ctdb;
1053 while (rec->election_timeout) {
1054 event_loop_once(ctdb->ev);
1059 Update our local flags from all remote connected nodes.
1060 This is only run when we are or we belive we are the recovery master
/* NOTE(review): incomplete extraction — `continue` statements, an
 * error-branch body after modflags, and the success return are missing
 * (fused numbering jumps).  Visible flow: compare each remote node's
 * view of its own flags with ours; on mismatch push the remote view to
 * the daemons and adopt it locally.  Returns MONITOR_* codes. */
1062 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1065 struct ctdb_context *ctdb = rec->ctdb;
1066 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1068 /* get the nodemap for all active remote nodes and verify
1069 they are the same as for this node
1071 for (j=0; j<nodemap->num; j++) {
1072 struct ctdb_node_map *remote_nodemap=NULL;
1075 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1078 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1082 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1083 mem_ctx, &remote_nodemap);
1085 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1086 nodemap->nodes[j].pnn));
1087 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1088 talloc_free(mem_ctx);
1089 return MONITOR_FAILED;
1091 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1092 /* We should tell our daemon about this so it
1093 updates its flags or else we will log the same
1094 message again in the next iteration of recovery.
1095 Since we are the recovery master we can just as
1096 well update the flags on all nodes.
1098 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
1100 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1104 /* Update our local copy of the flags in the recovery
1107 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1108 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1109 nodemap->nodes[j].flags));
1110 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1112 talloc_free(remote_nodemap);
1114 talloc_free(mem_ctx);
1119 /* Create a new random generation ip.
1120 The generation id can not be the INVALID_GENERATION id
1122 static uint32_t new_generation(void)
1124 uint32_t generation;
1127 generation = random();
1129 if (generation != INVALID_GENERATION) {
1139 create a temporary working database
/* NOTE(review): incomplete extraction — the name-building tail, the
 * non-valgrind tdb_flags branch and the returns are missing (fused
 * numbering jumps, e.g. 1149 -> 1156).  Creates a private recdb.tdb.*
 * file under db_directory_state; O_EXCL so a stale file fails fast,
 * TDB_NOLOCK since only this process touches it. */
1141 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1144 struct tdb_wrap *recdb;
1147 /* open up the temporary recovery database */
1148 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1149 ctdb->db_directory_state,
1156 tdb_flags = TDB_NOLOCK;
1157 if (ctdb->valgrinding) {
1158 tdb_flags |= TDB_NOMMAP;
1160 tdb_flags |= TDB_DISALLOW_NESTING;
1162 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1163 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1164 if (recdb == NULL) {
1165 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1175 a traverse function for pulling all relevent records from recdb
/* NOTE(review): incomplete extraction — the `struct recdb_data {`
 * header line, the len/failed/persistent members (used in
 * traverse_recdb below) and the closing brace were dropped. */
1178 struct ctdb_context *ctdb;
1179 struct ctdb_marshall_buffer *recdata;
1181 uint32_t allocated_len;
/* tdb_traverse_read callback: marshal one recdb record into the
 * outgoing PUSH_DB buffer, skipping empty records and (for
 * non-persistent DBs) re-pointing dmaster at this node.
 * NOTE(review): incomplete extraction — the skip/failure returns and
 * the final return are missing (fused numbering jumps).  The buffer is
 * grown in pulldb_preallocation_size chunks to amortise reallocs. */
1186 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1188 struct recdb_data *params = (struct recdb_data *)p;
1189 struct ctdb_rec_data *rec;
1190 struct ctdb_ltdb_header *hdr;
1192 /* skip empty records */
1193 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1197 /* update the dmaster field to point to us */
1198 hdr = (struct ctdb_ltdb_header *)data.dptr;
1199 if (!params->persistent) {
1200 hdr->dmaster = params->ctdb->pnn;
1201 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1204 /* add the record to the blob ready to send to the nodes */
1205 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1207 params->failed = true;
1210 if (params->len + rec->length >= params->allocated_len) {
1211 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1212 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1214 if (params->recdata == NULL) {
1215 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1216 rec->length + params->len, params->recdata->count));
1217 params->failed = true;
1220 params->recdata->count++;
1221 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1222 params->len += rec->length;
1229 push the recdb database out to all nodes
/* NOTE(review): incomplete extraction — the `persistent` parameter
 * line, error returns and the final return are missing (fused
 * numbering jumps).  Visible flow: traverse the local recovery tdb
 * into one marshalled buffer, then broadcast it to all active nodes
 * with PUSH_DB. */
1231 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1233 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1235 struct recdb_data params;
1236 struct ctdb_marshall_buffer *recdata;
1238 TALLOC_CTX *tmp_ctx;
1241 tmp_ctx = talloc_new(ctdb);
1242 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1244 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1245 CTDB_NO_MEMORY(ctdb, recdata);
1247 recdata->db_id = dbid;
1250 params.recdata = recdata;
1251 params.len = offsetof(struct ctdb_marshall_buffer, data);
1252 params.allocated_len = params.len;
1253 params.failed = false;
1254 params.persistent = persistent;
1256 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1257 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1258 talloc_free(params.recdata);
1259 talloc_free(tmp_ctx);
1263 if (params.failed) {
1264 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1265 talloc_free(params.recdata);
1266 talloc_free(tmp_ctx);
1270 recdata = params.recdata;
1272 outdata.dptr = (void *)recdata;
1273 outdata.dsize = params.len;
1275 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1276 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1278 CONTROL_TIMEOUT(), false, outdata,
1281 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1282 talloc_free(recdata);
1283 talloc_free(tmp_ctx);
1287 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1288 dbid, recdata->count));
1290 talloc_free(recdata);
1291 talloc_free(tmp_ctx);
1298 go through a full recovery on one database
/* NOTE(review): incomplete extraction — the dbid/persistent parameter
 * lines, the wipe-structure db_id assignment, error returns and the
 * final return are missing (fused numbering jumps).  Visible flow:
 * build a local recdb with the merged cluster contents, wipe the
 * database on all active nodes inside the recovery transaction, then
 * push the merged contents back out. */
1300 static int recover_database(struct ctdb_recoverd *rec,
1301 TALLOC_CTX *mem_ctx,
1305 struct ctdb_node_map *nodemap,
1306 uint32_t transaction_id)
1308 struct tdb_wrap *recdb;
1310 struct ctdb_context *ctdb = rec->ctdb;
1312 struct ctdb_control_wipe_database w;
1315 recdb = create_recdb(ctdb, mem_ctx);
1316 if (recdb == NULL) {
1320 /* pull all remote databases onto the recdb */
1321 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1323 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1327 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1329 /* wipe all the remote databases. This is safe as we are in a transaction */
1331 w.transaction_id = transaction_id;
1333 data.dptr = (void *)&w;
1334 data.dsize = sizeof(w);
1336 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1337 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1339 CONTROL_TIMEOUT(), false, data,
1342 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1347 /* push out the correct database. This sets the dmaster and skips
1348 the empty records */
1349 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1355 /* all done with this database */
/*
 * reload_nodes_file(): thin wrapper that re-reads the nodes file into
 * the ctdb context via ctdb_load_nodes_file().
 */
1362 reload the nodes file
1364 static void reload_nodes_file(struct ctdb_context *ctdb)
1367 ctdb_load_nodes_file(ctdb);
/*
 * ctdb_reload_remote_public_ips(): refresh the recovery master's cached
 * view of every node's public IP lists.
 *
 * For each node in the nodemap it frees any cached known/available IP
 * lists, skips inactive nodes, and re-fetches two lists from the node:
 * the "known" public IPs and (with CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE)
 * the IPs the node can currently host. On any fetch failure *culprit is
 * set to the failing node's pnn so the caller can assign blame.
 *
 * While ip checking is enabled (do_checkpublicip set and no
 * ip_check_disable_ctx active), a node whose known-IP allocation looks
 * inconsistent flags rec->need_takeover_run so a takeover run happens.
 *
 * NOTE(review): elided extraction — the error-return statements after
 * each DEBUG() and the loop's closing braces are missing from this view.
 */
1370 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1371 struct ctdb_recoverd *rec,
1372 struct ctdb_node_map *nodemap,
/* sanity check: our node table and the nodemap must agree on size */
1378 if (ctdb->num_nodes != nodemap->num) {
1379 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1380 ctdb->num_nodes, nodemap->num));
1382 *culprit = ctdb->pnn;
1387 for (j=0; j<nodemap->num; j++) {
1388 /* release any existing data */
1389 if (ctdb->nodes[j]->known_public_ips) {
1390 talloc_free(ctdb->nodes[j]->known_public_ips);
1391 ctdb->nodes[j]->known_public_ips = NULL;
1393 if (ctdb->nodes[j]->available_public_ips) {
1394 talloc_free(ctdb->nodes[j]->available_public_ips);
1395 ctdb->nodes[j]->available_public_ips = NULL;
/* inactive (stopped/banned/disconnected) nodes are not queried */
1398 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1402 /* grab a new shiny list of public ips from the node */
1403 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1405 ctdb->nodes[j]->pnn,
1408 &ctdb->nodes[j]->known_public_ips);
1410 DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1411 ctdb->nodes[j]->pnn));
1413 *culprit = ctdb->nodes[j]->pnn;
/* only verify allocation when ip checking has not been disabled */
1418 if (ctdb->do_checkpublicip) {
1419 if (rec->ip_check_disable_ctx == NULL) {
1420 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1421 DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1422 rec->need_takeover_run = true;
1427 /* grab a new shiny list of public ips from the node */
1428 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1430 ctdb->nodes[j]->pnn,
1432 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1433 &ctdb->nodes[j]->available_public_ips);
1435 DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1436 ctdb->nodes[j]->pnn));
1438 *culprit = ctdb->nodes[j]->pnn;
/*
 * sync_recovery_lock_file_across_cluster(): push the local recovery
 * master's reclock file path to all active nodes via
 * CTDB_CONTROL_SET_RECLOCK_FILE so the whole cluster agrees on it.
 * Returns non-zero (path elided in this view) if the broadcast fails;
 * a NULL recovery_lock_file is handled by the early branch at 1457.
 */
1447 /* when we start a recovery, make sure all nodes use the same reclock file
1450 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1452 struct ctdb_context *ctdb = rec->ctdb;
1453 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1457 if (ctdb->recovery_lock_file == NULL) {
/* include the trailing NUL so receivers get a proper C string */
1461 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1462 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1465 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1466 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1472 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1473 talloc_free(tmp_ctx);
1477 talloc_free(tmp_ctx);
/*
 * do_recovery(): the main recovery sequence, executed only by the
 * recovery master when recovery is needed.
 *
 * Visible sequence:
 *   1. ban any node blamed for too many recent recoveries;
 *   2. optionally take the recovery lock (verify_recovery_lock tunable);
 *   3. create missing local/remote databases and sync db priorities;
 *   4. sync the reclock file setting, set CTDB_RECOVERY_ACTIVE, and run
 *      the "startrecovery" event on all nodes;
 *   5. push our node flags out, bump the generation, start a
 *      cluster-wide transaction, recover every database via
 *      recover_database(), then commit;
 *   6. rebuild the vnnmap from active lmaster-capable nodes, publish it,
 *      claim recmaster on all nodes, and return to CTDB_RECOVERY_NORMAL;
 *   7. reload remote public IPs, run IP takeover, fire the "recovered"
 *      event, broadcast CTDB_SRVID_RECONFIGURE to clients, reset ban
 *      counts, and sleep for rerecovery_timeout to suppress immediate
 *      re-recovery.
 *
 * rec->need_recovery is set true up front so an early failure forces
 * another attempt; it is cleared only after the full sequence succeeds.
 *
 * NOTE(review): elided extraction — most error-return statements,
 * `continue;` lines and closing braces were dropped; comments annotate
 * the visible flow only.
 */
1483 we are the recmaster, and recovery is needed - start a recovery run
1485 static int do_recovery(struct ctdb_recoverd *rec,
1486 TALLOC_CTX *mem_ctx, uint32_t pnn,
1487 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1489 struct ctdb_context *ctdb = rec->ctdb;
1491 uint32_t generation;
1492 struct ctdb_dbid_map *dbmap;
1495 struct timeval start_time;
1496 uint32_t culprit = (uint32_t)-1;
1498 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1500 /* if recovery fails, force it again */
1501 rec->need_recovery = true;
/* ban any node that has triggered too many recoveries recently
   (threshold: 2 * number of nodes) */
1503 for (i=0; i<ctdb->num_nodes; i++) {
1504 struct ctdb_banning_state *ban_state;
1506 if (ctdb->nodes[i]->ban_state == NULL) {
1509 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1510 if (ban_state->count < 2*ctdb->num_nodes) {
1513 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1514 ctdb->nodes[i]->pnn, ban_state->count,
1515 ctdb->tunable.recovery_ban_period));
1516 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1517 ban_state->count = 0;
/* take the recovery lock; failure bans ourselves rather than risking
   a split-brain recovery */
1521 if (ctdb->tunable.verify_recovery_lock != 0) {
1522 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1523 start_time = timeval_current();
1524 if (!ctdb_recovery_lock(ctdb, true)) {
1525 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1526 "and ban ourself for %u seconds\n",
1527 ctdb->tunable.recovery_ban_period));
1528 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1531 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1532 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1535 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1537 /* get a list of all databases */
1538 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1540 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1544 /* we do the db creation before we set the recovery mode, so the freeze happens
1545 on all databases we will be dealing with. */
1547 /* verify that we have all the databases any other node has */
1548 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1550 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1554 /* verify that all other nodes have all our databases */
1555 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1557 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1560 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1562 /* update the database priority for all remote databases */
1563 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1565 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1567 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1570 /* update all other nodes to use the same setting for reclock files
1571 as the local recovery master.
1573 sync_recovery_lock_file_across_cluster(rec);
1575 /* set recovery mode to active on all nodes */
1576 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1578 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1582 /* execute the "startrecovery" event script on all nodes */
1583 ret = run_startrecovery_eventscript(rec, nodemap);
1585 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1590 update all nodes to have the same flags that we have
1592 for (i=0;i<nodemap->num;i++) {
1593 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1597 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1599 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1604 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1606 /* pick a new generation number */
1607 generation = new_generation();
1609 /* change the vnnmap on this node to use the new generation
1610 number but not on any other nodes.
1611 this guarantees that if we abort the recovery prematurely
1612 for some reason (a node stops responding?)
1613 that we can just return immediately and we will reenter
1614 recovery shortly again.
1615 I.e. we deliberately leave the cluster with an inconsistent
1616 generation id to allow us to abort recovery at any stage and
1617 just restart it from scratch.
1619 vnnmap->generation = generation;
1620 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1622 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* start a transaction (keyed by the new generation) on all active
   nodes; on failure try to cancel it everywhere */
1626 data.dptr = (void *)&generation;
1627 data.dsize = sizeof(uint32_t);
1629 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1630 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1632 CONTROL_TIMEOUT(), false, data,
1634 transaction_start_fail_callback,
1636 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1637 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1639 CONTROL_TIMEOUT(), false, tdb_null,
1643 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1648 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* recover every database inside the transaction */
1650 for (i=0;i<dbmap->num;i++) {
1651 ret = recover_database(rec, mem_ctx,
1653 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1654 pnn, nodemap, generation);
1656 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1661 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1663 /* commit all the changes */
1664 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1666 CONTROL_TIMEOUT(), false, data,
1669 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1673 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1676 /* update the capabilities for all nodes */
1677 ret = update_capabilities(ctdb, nodemap);
1679 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
/* rebuild the vnnmap from active, lmaster-capable nodes; fresh
   generation so the whole cluster converges on the new map */
1683 /* build a new vnn map with all the currently active and
1685 generation = new_generation();
1686 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1687 CTDB_NO_MEMORY(ctdb, vnnmap);
1688 vnnmap->generation = generation;
1690 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1691 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1692 for (i=j=0;i<nodemap->num;i++) {
1693 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1696 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1697 /* this node can not be an lmaster */
1698 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1703 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1704 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1705 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* degenerate case: no lmaster-capable node found, use ourselves */
1708 if (vnnmap->size == 0) {
1709 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1711 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1712 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1713 vnnmap->map[0] = pnn;
1716 /* update to the new vnnmap on all nodes */
1717 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1719 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1723 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1725 /* update recmaster to point to us for all nodes */
1726 ret = set_recovery_master(ctdb, nodemap, pnn);
1728 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1732 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1735 update all nodes to have the same flags that we have
1737 for (i=0;i<nodemap->num;i++) {
1738 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1742 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1744 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1749 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1751 /* disable recovery mode */
1752 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1754 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1758 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
/* refresh public-IP knowledge, then run IP takeover; failures defer
   to a later takeover run via rec->need_takeover_run */
1761 tell nodes to takeover their public IPs
1763 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1765 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1767 rec->need_takeover_run = true;
1770 rec->need_takeover_run = false;
1771 ret = ctdb_takeover_run(ctdb, nodemap);
1773 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1774 rec->need_takeover_run = true;
1777 /* execute the "recovered" event script on all nodes */
1778 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1780 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1784 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1786 /* send a message to all clients telling them that the cluster
1787 has been reconfigured */
1788 ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1790 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1792 rec->need_recovery = false;
1794 /* we managed to complete a full recovery, make sure to forgive
1795 any past sins by the nodes that could now participate in the
1798 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1799 for (i=0;i<nodemap->num;i++) {
1800 struct ctdb_banning_state *ban_state;
1802 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1806 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1807 if (ban_state == NULL) {
1811 ban_state->count = 0;
1815 /* We just finished a recovery successfully.
1816 We now wait for rerecovery_timeout before we allow
1817 another recovery to take place.
1819 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1820 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1821 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
/*
 * election_message: payload broadcast during a recmaster election.
 * Per the comment below, the winner is decided by comparing (in order)
 * num_connected, then priority_time, then the node's pnn — see
 * ctdb_election_win(). NOTE(review): the pnn field itself appears to
 * have been dropped by the extraction (it is referenced elsewhere as
 * em->pnn).
 */
1828 elections are won by first checking the number of connected nodes, then
1829 the priority time, then the pnn
1831 struct election_message {
1832 uint32_t num_connected;
1833 struct timeval priority_time;
1835 uint32_t node_flags;
/*
 * ctdb_election_data(): fill *em with this node's election credentials:
 * our pnn, our recoverd start time (priority_time), our node flags from
 * a freshly fetched nodemap, and a count of connected nodes. A node
 * lacking CTDB_CAP_RECMASTER deliberately zeroes num_connected and
 * resets priority_time to "now" so it loses any election.
 */
1839 form this nodes election data
1841 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1844 struct ctdb_node_map *nodemap;
1845 struct ctdb_context *ctdb = rec->ctdb;
1849 em->pnn = rec->ctdb->pnn;
1850 em->priority_time = rec->priority_time;
1852 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1854 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
/* cache our own flags for the win/lose checks in ctdb_election_win() */
1858 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1859 em->node_flags = rec->node_flags;
1861 for (i=0;i<nodemap->num;i++) {
1862 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1863 em->num_connected++;
1867 /* we shouldnt try to win this election if we cant be a recmaster */
1868 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1869 em->num_connected = 0;
1870 em->priority_time = timeval_current();
1873 talloc_free(nodemap);
/*
 * ctdb_election_win(): compare a received election message against our
 * own credentials and decide whether WE win.
 *
 * Hard rules first: we lose if we lack CTDB_CAP_RECMASTER or are
 * banned/stopped; we win automatically if the sender is banned or
 * stopped. Otherwise the tiebreak chain is: more connected nodes wins,
 * then the longer-running node (older priority_time), then pnn.
 *
 * NOTE(review): the return statements inside each branch were dropped
 * by the extraction; only the conditions are visible here.
 */
1877 see if the given election data wins
1879 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1881 struct election_message myem;
1884 ctdb_election_data(rec, &myem);
1886 /* we cant win if we dont have the recmaster capability */
1887 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1891 /* we cant win if we are banned */
1892 if (rec->node_flags & NODE_FLAGS_BANNED) {
1896 /* we cant win if we are stopped */
1897 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1901 /* we will automatically win if the other node is banned */
1902 if (em->node_flags & NODE_FLAGS_BANNED) {
1906 /* we will automatically win if the other node is banned */
1907 if (em->node_flags & NODE_FLAGS_STOPPED) {
1911 /* try to use the most connected node */
1913 cmp = (int)myem.num_connected - (int)em->num_connected;
1916 /* then the longest running node */
1918 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tiebreak: lower/higher pnn (exact sense depends on elided
   comparison of cmp — confirm against the full source) */
1922 cmp = (int)myem.pnn - (int)em->pnn;
/*
 * send_election_request(): broadcast our election credentials to all
 * nodes on CTDB_SRVID_RECOVERY. If update_recmaster is true we also
 * optimistically set ourselves as recmaster on the local node (pnn),
 * assuming we will win; losing the election corrects this later via
 * election_handler().
 */
1929 send out an election request
1931 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1934 TDB_DATA election_data;
1935 struct election_message emsg;
1937 struct ctdb_context *ctdb = rec->ctdb;
1939 srvid = CTDB_SRVID_RECOVERY;
1941 ctdb_election_data(rec, &emsg);
1943 election_data.dsize = sizeof(struct election_message);
1944 election_data.dptr = (unsigned char *)&emsg;
1947 /* send an election message to all active nodes */
1948 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1949 ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1952 /* A new node that is already frozen has entered the cluster.
1953 The existing nodes are not frozen and dont need to be frozen
1954 until the election has ended and we start the actual recovery
1956 if (update_recmaster == true) {
1957 /* first we assume we will win the election and set
1958 recoverymaster to be ourself on the current node
1960 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1962 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
/*
 * unban_all_nodes(): fetch the current nodemap and clear the BANNED
 * flag (via ctdb_ctrl_modflags) on every node that is connected and
 * currently banned. Best-effort: modflags results are not checked.
 */
1972 this function will unban all nodes in the cluster
1974 static void unban_all_nodes(struct ctdb_context *ctdb)
1977 struct ctdb_node_map *nodemap;
1978 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1980 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1982 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1986 for (i=0;i<nodemap->num;i++) {
1987 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1988 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1989 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1993 talloc_free(tmp_ctx);
/*
 * election_send_request(): timed-event callback fired while we believe
 * we are winning an election — rebroadcasts our election request
 * (update_recmaster=false) and then disarms its own timer by freeing
 * rec->send_election_te.
 */
1998 we think we are winning the election - send a broadcast election request
2000 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2002 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2005 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
2007 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2010 talloc_free(rec->send_election_te);
2011 rec->send_election_te = NULL;
/*
 * mem_dump_handler(): srvid message handler for memory-dump requests.
 * The payload is a struct rd_memdump_reply giving the requester's
 * pnn/srvid return address; we collect our talloc memory report with
 * ctdb_dump_memory() and send it back as a message. All temporaries
 * hang off tmp_ctx, which is freed on every visible exit path.
 */
2015 handler for memory dumps
2017 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2018 TDB_DATA data, void *private_data)
2020 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2023 struct rd_memdump_reply *rd;
/* validate the payload before casting it to the reply-address struct */
2025 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2026 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2027 talloc_free(tmp_ctx);
2030 rd = (struct rd_memdump_reply *)data.dptr;
2032 dump = talloc_zero(tmp_ctx, TDB_DATA);
2034 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2035 talloc_free(tmp_ctx);
2038 ret = ctdb_dump_memory(ctdb, dump);
2040 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2041 talloc_free(tmp_ctx);
2045 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2047 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2049 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2050 talloc_free(tmp_ctx);
2054 talloc_free(tmp_ctx);
/*
 * reload_nodes_handler(): srvid message handler that re-reads the
 * nodes file on request; the message payload is unused.
 */
2058 handler for reload_nodes
2060 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2061 TDB_DATA data, void *private_data)
2063 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2065 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2067 reload_nodes_file(rec->ctdb);
/*
 * reenable_ip_check(): timer callback that re-enables public-IP
 * verification by freeing ip_check_disable_ctx (whose non-NULL-ness is
 * what disable_ip_check_handler uses to suppress the checks).
 */
2071 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
2072 struct timeval yt, void *p)
2074 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2076 talloc_free(rec->ip_check_disable_ctx);
2077 rec->ip_check_disable_ctx = NULL;
/*
 * ctdb_rebalance_timeout(): deferred-rebalance timer callback. Runs a
 * full IP takeover pass; on failure defers to a later run by setting
 * rec->need_takeover_run. Frees deferred_rebalance_ctx so the deferral
 * can be re-armed by recd_node_rebalance_handler().
 */
2081 static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event *te,
2082 struct timeval t, void *p)
2084 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2085 struct ctdb_context *ctdb = rec->ctdb;
2088 DEBUG(DEBUG_NOTICE,("Rebalance all nodes that have had ip assignment changes.\n"));
2090 ret = ctdb_takeover_run(ctdb, rec->nodemap);
2092 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
2093 rec->need_takeover_run = true;
2096 talloc_free(rec->deferred_rebalance_ctx);
2097 rec->deferred_rebalance_ctx = NULL;
/*
 * recd_node_rebalance_handler(): srvid handler asking for a rebalance
 * towards a node (payload: one uint32_t pnn). Validates the payload
 * size, honours the deferred_rebalance_on_node_add tunable (0 = do
 * nothing visible here), marks the node for forced rebalancing via
 * lcp2_forcerebalance(), and (re)arms a one-shot deferred timer that
 * fires ctdb_rebalance_timeout after the tunable's delay. Re-arming
 * frees the previous deferral context, which also cancels its timer.
 */
2101 static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvid,
2102 TDB_DATA data, void *private_data)
2105 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2107 if (data.dsize != sizeof(uint32_t)) {
2108 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2112 if (ctdb->tunable.deferred_rebalance_on_node_add == 0) {
2116 pnn = *(uint32_t *)&data.dptr[0];
2118 lcp2_forcerebalance(ctdb, pnn);
2119 DEBUG(DEBUG_NOTICE,("Received message to perform node rebalancing for node %d\n", pnn));
2121 if (rec->deferred_rebalance_ctx != NULL) {
2122 talloc_free(rec->deferred_rebalance_ctx);
2124 rec->deferred_rebalance_ctx = talloc_new(rec);
2125 event_add_timed(ctdb->ev, rec->deferred_rebalance_ctx,
2126 timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
2127 ctdb_rebalance_timeout, rec);
/*
 * recd_update_ip_handler(): srvid handler for single-IP assignment
 * updates (payload: struct ctdb_public_ip). Ignored unless we are the
 * recmaster; otherwise validates the size and records the assignment
 * in the IP tree via update_ip_assignment_tree().
 */
2132 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2133 TDB_DATA data, void *private_data)
2135 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2136 struct ctdb_public_ip *ip;
2138 if (rec->recmaster != rec->ctdb->pnn) {
2139 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2143 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2144 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2148 ip = (struct ctdb_public_ip *)data.dptr;
2150 update_ip_assignment_tree(rec->ctdb, ip);
/*
 * disable_ip_check_handler(): srvid handler that suspends public-IP
 * verification for a requested number of seconds (payload: uint32_t
 * timeout). Any existing disable window is cancelled first; a timeout
 * of 0 (the elided branch around 2179) re-enables checking, otherwise
 * a talloc context is created whose child timer calls
 * reenable_ip_check() after the window expires — freeing the context
 * cancels the timer.
 *
 * NOTE(review): the log strings contain typos ("expexting",
 * "recaived") inherited from upstream; left untouched here since a
 * doc-only pass must not alter runtime strings.
 */
2154 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2155 TDB_DATA data, void *private_data)
2157 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2160 if (rec->ip_check_disable_ctx != NULL) {
2161 talloc_free(rec->ip_check_disable_ctx);
2162 rec->ip_check_disable_ctx = NULL;
2165 if (data.dsize != sizeof(uint32_t)) {
2166 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2167 "expexting %lu\n", (long unsigned)data.dsize,
2168 (long unsigned)sizeof(uint32_t)));
2171 if (data.dptr == NULL) {
2172 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
2176 timeout = *((uint32_t *)data.dptr);
2179 DEBUG(DEBUG_NOTICE,("Reenabling ip check\n"));
2183 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
2185 rec->ip_check_disable_ctx = talloc_new(rec);
2186 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
2188 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
/*
 * ip_reloadall_handler(): srvid handler for "reload all public IPs"
 * requests. Validates the payload (struct reloadips_all_reply carrying
 * the requester's pnn/srvid return address) and stashes it in the
 * file-global reload_all_ips_request (talloc_steal'd onto rec) so the
 * monitor loop can perform the reload later and reply.
 */
2193 handler for reload all ips.
2195 static void ip_reloadall_handler(struct ctdb_context *ctdb, uint64_t srvid,
2196 TDB_DATA data, void *private_data)
2198 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2200 if (data.dsize != sizeof(struct reloadips_all_reply)) {
2201 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2205 reload_all_ips_request = (struct reloadips_all_reply *)talloc_steal(rec, data.dptr);
2207 DEBUG(DEBUG_NOTICE,("RELOAD_ALL_IPS message received from node:%d srvid:%d\n", reload_all_ips_request->pnn, (int)reload_all_ips_request->srvid));
/*
 * async_reloadips_callback(): per-node completion callback for the
 * async RELOAD_PUBLIC_IPS broadcast; logs the failing node. The
 * *status accumulator update is elided from this view.
 */
2211 static void async_reloadips_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2213 uint32_t *status = callback_data;
2216 DEBUG(DEBUG_ERR,("Reload ips all failed on node %d\n", node_pnn));
/*
 * reload_all_ips(): perform a cluster-wide public-IP reload on behalf
 * of the requester recorded in *rips. Refuses unless every node is
 * fully up and healthy (flags == 0), then broadcasts
 * CTDB_CONTROL_RELOAD_PUBLIC_IPS to all connected nodes with
 * async_reloadips_callback collecting per-node failures, and finally
 * sends an empty message back to the requester's pnn/srvid as the
 * completion acknowledgement.
 */
2222 reload_all_ips(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, struct reloadips_all_reply *rips)
2224 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2229 DEBUG(DEBUG_ERR,("RELOAD ALL IPS on all active nodes\n"));
2230 for (i = 0; i< nodemap->num; i++) {
2231 if (nodemap->nodes[i].flags != 0) {
2232 DEBUG(DEBUG_ERR, ("Can not reload ips on all nodes. Node %d is not up and healthy\n", i));
2233 talloc_free(tmp_ctx);
2238 /* send the flags update to all connected nodes */
2239 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2241 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELOAD_PUBLIC_IPS,
2245 async_reloadips_callback, NULL,
2247 DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2248 talloc_free(tmp_ctx);
2253 DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2254 talloc_free(tmp_ctx);
/* ack the original requester so the client's "ctdb reloadips" returns */
2258 ctdb_client_send_message(ctdb, rips->pnn, rips->srvid, tdb_null);
2260 talloc_free(tmp_ctx);
/*
 * ip_reallocate_handler(): srvid handler for "ctdb ipreallocate"
 * requests. Deliberately does NOT run the takeover itself — it only
 * queues the caller's reply address (struct rd_memdump_reply) onto
 * rec->reallocate_callers, creating rec->ip_reallocate_ctx lazily as
 * the list's talloc parent. process_ipreallocate_requests() later
 * services the queue from the monitor loop, avoiding recursion into
 * ctdb_takeover_run().
 */
2266 handler for ip reallocate, just add it to the list of callers and
2267 handle this later in the monitor_cluster loop so we do not recurse
2268 with other callers to takeover_run()
2270 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2271 TDB_DATA data, void *private_data)
2273 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2274 struct ip_reallocate_list *caller;
2276 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2277 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2281 if (rec->ip_reallocate_ctx == NULL) {
2282 rec->ip_reallocate_ctx = talloc_new(rec);
2283 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2286 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2287 CTDB_NO_MEMORY_FATAL(ctdb, caller);
/* take ownership of the payload; push caller onto the singly-linked list */
2289 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2290 caller->next = rec->reallocate_callers;
2291 rec->reallocate_callers = caller;
/*
 * process_ipreallocate_requests(): service the queue built by
 * ip_reallocate_handler(). Refreshes the remote public-IP lists, runs
 * ctdb_takeover_run(), then sends each queued caller a reply message
 * containing the int32 result (callers with srvid==0 opted out of a
 * reply). Finishes by freeing ip_reallocate_ctx — which also frees
 * every queued entry — and resetting the list pointers.
 */
2296 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2298 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2301 struct ip_reallocate_list *callers;
2304 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2306 /* update the list of public ips that a node can handle for
2309 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2311 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2313 rec->need_takeover_run = true;
2316 ret = ctdb_takeover_run(ctdb, rec->nodemap);
2318 DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
2319 rec->need_takeover_run = true;
/* reply payload is the takeover-run result code */
2323 result.dsize = sizeof(int32_t);
2324 result.dptr = (uint8_t *)&ret;
2326 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2328 /* Someone that sent srvid==0 does not want a reply */
2329 if (callers->rd->srvid == 0) {
2332 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2333 "%u:%llu\n", (unsigned)callers->rd->pnn,
2334 (unsigned long long)callers->rd->srvid));
2335 ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2337 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2338 "message to %u:%llu\n",
2339 (unsigned)callers->rd->pnn,
2340 (unsigned long long)callers->rd->srvid));
/* freeing the ctx releases all queued ip_reallocate_list entries */
2344 talloc_free(tmp_ctx);
2345 talloc_free(rec->ip_reallocate_ctx);
2346 rec->ip_reallocate_ctx = NULL;
2347 rec->reallocate_callers = NULL;
/*
 * election_handler(): srvid handler for incoming recmaster election
 * packets (struct election_message).
 *
 * Every packet restarts the election timeout. If ctdb_election_win()
 * says we beat the sender, we (re)arm a 500ms timer to rebroadcast our
 * own candidacy and return. Otherwise we concede: cancel any pending
 * rebroadcast, release the recovery lock fd if the winner is not us
 * (unbanning everyone, since bans are only meaningful under a
 * recmaster), and record the sender as recmaster on the local node via
 * ctdb_ctrl_setrecmaster().
 */
2353 handler for recovery master elections
2355 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2356 TDB_DATA data, void *private_data)
2358 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2360 struct election_message *em = (struct election_message *)data.dptr;
2361 TALLOC_CTX *mem_ctx;
2363 /* we got an election packet - update the timeout for the election */
2364 talloc_free(rec->election_timeout);
2365 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2367 timeval_current_ofs(0, 500000) :
2368 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2369 ctdb_election_timeout, rec);
2371 mem_ctx = talloc_new(ctdb);
2373 /* someone called an election. check their election data
2374 and if we disagree and we would rather be the elected node,
2375 send a new election message to all other nodes
2377 if (ctdb_election_win(rec, em)) {
2378 if (!rec->send_election_te) {
2379 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2380 timeval_current_ofs(0, 500000),
2381 election_send_request, rec);
2383 talloc_free(mem_ctx);
2384 /*unban_all_nodes(ctdb);*/
/* we lost: stop campaigning */
2389 talloc_free(rec->send_election_te);
2390 rec->send_election_te = NULL;
2392 if (ctdb->tunable.verify_recovery_lock != 0) {
2393 /* release the recmaster lock */
2394 if (em->pnn != ctdb->pnn &&
2395 ctdb->recovery_lock_fd != -1) {
2396 close(ctdb->recovery_lock_fd);
2397 ctdb->recovery_lock_fd = -1;
2398 unban_all_nodes(ctdb);
2402 /* ok, let that guy become recmaster then */
2403 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2405 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2406 talloc_free(mem_ctx);
2410 talloc_free(mem_ctx);
/*
 * force_election(): kick off a recmaster election. Puts the whole
 * cluster into CTDB_RECOVERY_ACTIVE to quiesce internode traffic,
 * arms the election timeout, broadcasts our candidacy with
 * update_recmaster=true (optimistically claiming recmaster locally),
 * and then blocks in ctdb_wait_election() to collect responses.
 */
2416 force the start of the election process
2418 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2419 struct ctdb_node_map *nodemap)
2422 struct ctdb_context *ctdb = rec->ctdb;
2424 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2426 /* set all nodes to recovery mode to stop all internode traffic */
2427 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2429 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2433 talloc_free(rec->election_timeout);
2434 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2436 timeval_current_ofs(0, 500000) :
2437 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2438 ctdb_election_timeout, rec);
2440 ret = send_election_request(rec, pnn, true);
2442 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2446 /* wait for a few seconds to collect all responses */
2447 ctdb_wait_election(rec);
/*
 * monitor_handler(): srvid handler for node flag-change notifications
 * (struct ctdb_node_flag_change).
 *
 * Looks the node up in a freshly fetched nodemap, logs the change,
 * records the new flags, and — when we are the recmaster and recovery
 * mode is NORMAL — sets rec->need_takeover_run if the DISABLED bit
 * changed, because permanently-disabled/unhealthy transitions require
 * an IP failover that no recovery would otherwise trigger (banned or
 * disconnected transitions are handled elsewhere, per the comment at
 * 2514-2515).
 */
2453 handler for when a node changes its flags
2455 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2456 TDB_DATA data, void *private_data)
2459 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2460 struct ctdb_node_map *nodemap=NULL;
2461 TALLOC_CTX *tmp_ctx;
2463 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2464 int disabled_flag_changed;
2466 if (data.dsize != sizeof(*c)) {
2467 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2471 tmp_ctx = talloc_new(ctdb);
2472 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2474 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2476 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2477 talloc_free(tmp_ctx);
/* find the changed node's slot in the nodemap by pnn */
2482 for (i=0;i<nodemap->num;i++) {
2483 if (nodemap->nodes[i].pnn == c->pnn) break;
2486 if (i == nodemap->num) {
2487 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2488 talloc_free(tmp_ctx);
2492 if (nodemap->nodes[i].flags != c->new_flags) {
2493 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, nodemap->nodes[i].flags));
/* XOR old vs new to see whether the DISABLED bit flipped */
2496 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2498 nodemap->nodes[i].flags = c->new_flags;
2500 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2501 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2504 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2505 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2509 ctdb->recovery_master == ctdb->pnn &&
2510 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2511 /* Only do the takeover run if the perm disabled or unhealthy
2512 flags changed since these will cause an ip failover but not
2514 If the node became disconnected or banned this will also
2515 lead to an ip address failover but that is handled
2518 if (disabled_flag_changed) {
2519 rec->need_takeover_run = true;
2523 talloc_free(tmp_ctx);
2527 handler for when we need to push out flag changes ot all other nodes
/* Message handler asking us to push a node's flags out to the whole
 * cluster: fetch the authoritative nodemap from the recovery master and
 * broadcast a MODIFY_FLAGS control to all connected nodes.
 * NOTE(review): extraction dropped some lines (braces/returns and part of
 * the async-control argument list); comments cover visible code only. */
2529 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2530 TDB_DATA data, void *private_data)
2533 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2534 struct ctdb_node_map *nodemap=NULL;
2535 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2539 /* find the recovery master */
2540 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2542 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2543 talloc_free(tmp_ctx);
2547 /* read the node flags from the recmaster */
2548 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2550 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2551 talloc_free(tmp_ctx);
/* the target node must exist in the recmaster's nodemap */
2554 if (c->pnn >= nodemap->num) {
2555 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2556 talloc_free(tmp_ctx);
2560 /* send the flags update to all connected nodes */
2561 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2563 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2564 nodes, 0, CONTROL_TIMEOUT(),
2568 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2570 talloc_free(tmp_ctx);
2574 talloc_free(tmp_ctx);
/* Shared state for the async "verify all nodes are in normal recmode"
 * sweep; updated by verify_recmode_normal_callback().
 * NOTE(review): a pending-reply counter member exists in the original
 * (verify_recmode() waits on rmdata->count) but is not visible here. */
2578 struct verify_recmode_normal_data {
2580 enum monitor_result status;
/* Async completion callback for one getrecmode control: records a failure
 * if the control itself failed, or flags that recovery is needed when the
 * remote node reports it is not in CTDB_RECOVERY_NORMAL. */
2583 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2585 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2588 /* one more node has responded with recmode data*/
2591 /* if we failed to get the recmode, then return an error and let
2592 the main loop try again.
2594 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade from OK so an earlier, stronger verdict is kept */
2595 if (rmdata->status == MONITOR_OK) {
2596 rmdata->status = MONITOR_FAILED;
2601 /* if we got a response, then the recmode will be stored in the
2604 if (state->status != CTDB_RECOVERY_NORMAL) {
2605 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2606 rmdata->status = MONITOR_RECOVERY_NEEDED;
2613 /* verify that all nodes are in normal recovery mode */
/* Sends an async getrecmode control to every active node, pumps the event
 * loop until all replies arrive, and returns the aggregated
 * monitor_result (MONITOR_OK / MONITOR_FAILED / MONITOR_RECOVERY_NEEDED). */
2614 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2616 struct verify_recmode_normal_data *rmdata;
2617 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2618 struct ctdb_client_control_state *state;
2619 enum monitor_result status;
2622 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2623 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2625 rmdata->status = MONITOR_OK;
2627 /* loop over all active nodes and send an async getrecmode call to
2629 for (j=0; j<nodemap->num; j++) {
/* skip banned/stopped/disconnected nodes */
2630 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2633 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2635 nodemap->nodes[j].pnn);
2636 if (state == NULL) {
2637 /* we failed to send the control, treat this as
2638 an error and try again next iteration
2640 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2641 talloc_free(mem_ctx);
2642 return MONITOR_FAILED;
2645 /* set up the callback functions */
2646 state->async.fn = verify_recmode_normal_callback;
2647 state->async.private_data = rmdata;
2649 /* one more control to wait for to complete */
2654 /* now wait for up to the maximum number of seconds allowed
2655 or until all nodes we expect a response from has replied
2657 while (rmdata->count > 0) {
2658 event_loop_once(ctdb->ev);
/* copy the verdict out before freeing the context that owns rmdata */
2661 status = rmdata->status;
2662 talloc_free(mem_ctx);
/* Shared state for the async "verify all nodes agree we are recmaster"
 * sweep; updated by verify_recmaster_callback().
 * NOTE(review): count and pnn members exist in the original (the callback
 * reads rmdata->pnn, verify_recmaster() waits on rmdata->count) but are
 * not visible in this extract. */
2667 struct verify_recmaster_data {
2668 struct ctdb_recoverd *rec;
2671 enum monitor_result status;
/* Async completion callback for one getrecmaster control: records a
 * failure if the control failed, or requests a new election (and marks the
 * disagreeing node as culprit) when a remote node reports a recmaster
 * other than us. */
2674 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2676 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2679 /* one more node has responded with recmaster data*/
2682 /* if we failed to get the recmaster, then return an error and let
2683 the main loop try again.
2685 if (state->state != CTDB_CONTROL_DONE) {
/* only downgrade from OK so a stronger earlier verdict survives */
2686 if (rmdata->status == MONITOR_OK) {
2687 rmdata->status = MONITOR_FAILED;
2692 /* if we got a response, then the recmaster will be stored in the
2695 if (state->status != rmdata->pnn) {
2696 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2697 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2698 rmdata->status = MONITOR_ELECTION_NEEDED;
2705 /* verify that all nodes agree that we are the recmaster */
/* Sends an async getrecmaster control to every active node, pumps the
 * event loop until all replies arrive, and returns the aggregated verdict
 * (MONITOR_OK / MONITOR_FAILED / MONITOR_ELECTION_NEEDED).  pnn is our own
 * node number, which each remote answer is compared against. */
2706 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2708 struct ctdb_context *ctdb = rec->ctdb;
2709 struct verify_recmaster_data *rmdata;
2710 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2711 struct ctdb_client_control_state *state;
2712 enum monitor_result status;
2715 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2716 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2720 rmdata->status = MONITOR_OK;
2722 /* loop over all active nodes and send an async getrecmaster call to
2724 for (j=0; j<nodemap->num; j++) {
/* skip banned/stopped/disconnected nodes */
2725 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2728 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2730 nodemap->nodes[j].pnn);
2731 if (state == NULL) {
2732 /* we failed to send the control, treat this as
2733 an error and try again next iteration
2735 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2736 talloc_free(mem_ctx);
2737 return MONITOR_FAILED;
2740 /* set up the callback functions */
2741 state->async.fn = verify_recmaster_callback;
2742 state->async.private_data = rmdata;
2744 /* one more control to wait for to complete */
2749 /* now wait for up to the maximum number of seconds allowed
2750 or until all nodes we expect a response from has replied
2752 while (rmdata->count > 0) {
2753 event_loop_once(ctdb->ev);
/* copy the verdict out before freeing the context that owns rmdata */
2756 status = rmdata->status;
2757 talloc_free(mem_ctx);
2762 /* called to check that the local allocation of public ip addresses is ok.
/* Compares the local node's interfaces and public IP assignments against
 * what they should be.  If interfaces changed, an IP is unassigned but
 * hostable here, an IP assigned to us is missing from the system, or we
 * hold an IP assigned elsewhere (which is released immediately), a
 * takeover run is requested from the recovery master via
 * CTDB_SRVID_TAKEOVER_RUN.  Uptime is sampled before and after the IP
 * query so the check can be skipped if a recovery raced with it. */
2764 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
2766 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2767 struct ctdb_control_get_ifaces *ifaces = NULL;
2768 struct ctdb_all_public_ips *ips = NULL;
2769 struct ctdb_uptime *uptime1 = NULL;
2770 struct ctdb_uptime *uptime2 = NULL;
2772 bool need_iface_check = false;
2773 bool need_takeover_run = false;
/* first uptime sample - taken before reading the IP list */
2775 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2776 CTDB_CURRENT_NODE, &uptime1);
2778 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2779 talloc_free(mem_ctx);
2784 /* read the interfaces from the local node */
2785 ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
2787 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
2788 talloc_free(mem_ctx);
/* detect any change in the interface set (count or content) since the
 * previously cached snapshot in rec->ifaces */
2793 need_iface_check = true;
2794 } else if (rec->ifaces->num != ifaces->num) {
2795 need_iface_check = true;
2796 } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
2797 need_iface_check = true;
/* replace the cached snapshot with the fresh one */
2800 talloc_free(rec->ifaces);
2801 rec->ifaces = talloc_steal(rec, ifaces);
2803 if (need_iface_check) {
2804 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2805 "local node %u - force takeover run\n",
2807 need_takeover_run = true;
2810 /* read the ip allocation from the local node */
2811 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2813 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2814 talloc_free(mem_ctx);
/* second uptime sample - taken after reading the IP list */
2818 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2819 CTDB_CURRENT_NODE, &uptime2);
2821 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2822 talloc_free(mem_ctx);
2826 /* skip the check if the startrecovery time has changed */
2827 if (timeval_compare(&uptime1->last_recovery_started,
2828 &uptime2->last_recovery_started) != 0) {
2829 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2830 talloc_free(mem_ctx);
2834 /* skip the check if the endrecovery time has changed */
2835 if (timeval_compare(&uptime1->last_recovery_finished,
2836 &uptime2->last_recovery_finished) != 0) {
2837 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2838 talloc_free(mem_ctx);
2842 /* skip the check if we have started but not finished recovery */
2843 if (timeval_compare(&uptime1->last_recovery_finished,
2844 &uptime1->last_recovery_started) != 1) {
2845 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2846 talloc_free(mem_ctx);
2851 /* verify that we have the ip addresses we should have
2852 and we dont have ones we shouldnt have.
2853 if we find an inconsistency we set recmode to
2854 active on the local node and wait for the recmaster
2855 to do a full blown recovery.
2856 also if the pnn is -1 and we are healthy and can host the ip
2857 we also request a ip reallocation.
2859 if (ctdb->tunable.disable_ip_failover == 0) {
2860 for (j=0; j<ips->num; j++) {
/* unassigned IP while we are fully healthy (flags == 0): ask for a run */
2861 if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
2862 DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
2863 ctdb_addr_to_str(&ips->ips[j].addr)));
2864 need_takeover_run = true;
2865 } else if (ips->ips[j].pnn == pnn) {
/* IP assigned to us but missing from the system */
2866 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
2867 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2868 ctdb_addr_to_str(&ips->ips[j].addr)));
2869 need_takeover_run = true;
/* IP assigned elsewhere but still configured locally: release it now */
2872 if (ctdb->do_checkpublicip && ctdb_sys_have_ip(&ips->ips[j].addr)) {
2874 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving. Removing it.\n",
2875 ctdb_addr_to_str(&ips->ips[j].addr)));
2877 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
2878 DEBUG(DEBUG_ERR,("Failed to release local ip address\n"));
/* ask the recovery master for a takeover run if anything was wrong */
2885 if (need_takeover_run) {
2886 struct takeover_run_reply rd;
2889 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
2893 data.dptr = (uint8_t *)&rd;
2894 data.dsize = sizeof(rd);
2896 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2898 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2901 talloc_free(mem_ctx);
/* Async callback for CTDB_CONTROL_GET_NODEMAP: stores the nodemap that
 * node_pnn returned into the remote_nodemaps array (indexed by pnn),
 * stealing ownership of the reply buffer onto that array. */
2906 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2908 struct ctdb_node_map **remote_nodemaps = callback_data;
/* guard against a reply claiming a pnn outside our known node range */
2910 if (node_pnn >= ctdb->num_nodes) {
2911 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2915 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/* Collects the nodemap from every active node in parallel via an async
 * GET_NODEMAP broadcast; results land in remote_nodemaps[] (indexed by
 * pnn) through async_getnodemap_callback.  Returns non-zero on failure.
 * NOTE(review): some argument lines of the async-control call were
 * dropped by extraction. */
2919 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2920 struct ctdb_node_map *nodemap,
2921 struct ctdb_node_map **remote_nodemaps)
2925 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2926 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2928 CONTROL_TIMEOUT(), false, tdb_null,
2929 async_getnodemap_callback,
2931 remote_nodemaps) != 0) {
2932 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* Outcome of the reclock-checking child process, and the state shared
 * between the parent's event handlers while the check is in flight.
 * NOTE(review): the pipe fd pair and child pid members are used by
 * check_recovery_lock()/check_reclock_destructor() but their declarations
 * are not visible in this extract. */
2940 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
2941 struct ctdb_check_reclock_state {
2942 struct ctdb_context *ctdb;
2943 struct timeval start_time;
2946 struct timed_event *te;
2947 struct fd_event *fde;
2948 enum reclock_child_status status;
2951 /* when we free the reclock state we must kill any child process.
/* talloc destructor: report how long the reclock check took, close both
 * pipe ends (if still open) and SIGKILL the checker child. */
2953 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2955 struct ctdb_context *ctdb = state->ctdb;
2957 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2959 if (state->fd[0] != -1) {
2960 close(state->fd[0]);
2963 if (state->fd[1] != -1) {
2964 close(state->fd[1]);
2967 ctdb_kill(ctdb, state->child, SIGKILL);
2972 called if our check_reclock child times out. this would happen if
2973 i/o to the reclock file blocks.
/* Timer callback: marks the check as RECLOCK_TIMEOUT so the wait loop in
 * check_recovery_lock() stops spinning. */
2975 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2976 struct timeval t, void *private_data)
2978 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2979 struct ctdb_check_reclock_state);
2981 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2982 state->status = RECLOCK_TIMEOUT;
2985 /* this is called when the child process has completed checking the reclock
2986 file and has written data back to us through the pipe.
/* fd-event callback: the child wrote its one-byte verdict; cancel the
 * timeout timer, read the byte and translate it to RECLOCK_OK/FAILED. */
2988 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2989 uint16_t flags, void *private_data)
2991 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2992 struct ctdb_check_reclock_state);
2996 /* we got a response from our child process so we can abort the
/* freeing the timed event cancels the pending timeout */
2999 talloc_free(state->te);
3002 ret = read(state->fd[0], &c, 1);
3003 if (ret != 1 || c != RECLOCK_OK) {
3004 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3005 state->status = RECLOCK_FAILED;
3010 state->status = RECLOCK_OK;
/* Verifies the recovery lock we hold is still valid, without risking the
 * recovery daemon blocking: forks a child that pread()s one byte from
 * recovery_lock_fd and reports the result over a pipe, while the parent
 * waits on an fd event guarded by a 15s timeout.  On RECLOCK_FAILED the
 * lock fd is closed so it will be re-taken later.
 * NOTE(review): extraction dropped several lines (returns, fd
 * initialisation, some event_add_fd arguments); comments describe the
 * visible statements only. */
3014 static int check_recovery_lock(struct ctdb_context *ctdb)
3017 struct ctdb_check_reclock_state *state;
/* captured pre-fork so the child can poll for its parent's death */
3018 pid_t parent = getpid();
3020 if (ctdb->recovery_lock_fd == -1) {
3021 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
3025 state = talloc(ctdb, struct ctdb_check_reclock_state);
3026 CTDB_NO_MEMORY(ctdb, state);
3029 state->start_time = timeval_current();
3030 state->status = RECLOCK_CHECKING;
3034 ret = pipe(state->fd);
3037 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
3041 state->child = ctdb_fork(ctdb);
3042 if (state->child == (pid_t)-1) {
3043 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
3044 close(state->fd[0]);
3046 close(state->fd[1]);
/* --- child: probe the reclock file and report over the pipe --- */
3052 if (state->child == 0) {
3053 char cc = RECLOCK_OK;
3054 close(state->fd[0]);
3057 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
/* a failing pread means the lock file is no longer readable (e.g. the
 * cluster filesystem revoked it) */
3058 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
3059 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
3060 cc = RECLOCK_FAILED;
3063 write(state->fd[1], &cc, 1);
3064 /* make sure we die when our parent dies */
3065 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
3067 write(state->fd[1], &cc, 1);
/* --- parent: close the write end and wait for the verdict --- */
3071 close(state->fd[1]);
3073 set_close_on_exec(state->fd[0]);
3075 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
/* destructor kills the child and closes fds whatever path we exit by */
3077 talloc_set_destructor(state, check_reclock_destructor);
3079 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
3080 ctdb_check_reclock_timeout, state);
3081 if (state->te == NULL) {
3082 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
3087 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
3089 reclock_child_handler,
3092 if (state->fde == NULL) {
3093 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3097 tevent_fd_set_auto_close(state->fde);
/* pump events until the child replies or the 15s timer fires */
3099 while (state->status == RECLOCK_CHECKING) {
3100 event_loop_once(ctdb->ev);
3103 if (state->status == RECLOCK_FAILED) {
3104 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
/* drop the stale lock fd; it must be re-acquired before it is trusted */
3105 close(ctdb->recovery_lock_fd);
3106 ctdb->recovery_lock_fd = -1;
/* Re-reads the configured reclock file path from the main daemon and
 * reconciles our cached copy: handles the path being disabled (NULL),
 * first seen, unchanged, or changed.  Whenever the path goes away or
 * changes, any open lock fd is closed and lock verification is switched
 * off until the new lock is taken.
 * NOTE(review): return statements were dropped by extraction; each
 * talloc_free(tmp_ctx) below precedes an exit from the function. */
3115 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3117 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3118 const char *reclockfile;
3120 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3121 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3122 talloc_free(tmp_ctx);
/* case 1: reclock has been disabled in the daemon */
3126 if (reclockfile == NULL) {
3127 if (ctdb->recovery_lock_file != NULL) {
3128 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3129 talloc_free(ctdb->recovery_lock_file);
3130 ctdb->recovery_lock_file = NULL;
3131 if (ctdb->recovery_lock_fd != -1) {
3132 close(ctdb->recovery_lock_fd);
3133 ctdb->recovery_lock_fd = -1;
3136 ctdb->tunable.verify_recovery_lock = 0;
3137 talloc_free(tmp_ctx);
/* case 2: first time we learn the reclock path */
3141 if (ctdb->recovery_lock_file == NULL) {
3142 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3143 if (ctdb->recovery_lock_fd != -1) {
3144 close(ctdb->recovery_lock_fd);
3145 ctdb->recovery_lock_fd = -1;
3147 talloc_free(tmp_ctx);
/* case 3: path unchanged - nothing to do */
3152 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3153 talloc_free(tmp_ctx);
/* case 4: path changed - adopt the new one and drop the old lock */
3157 talloc_free(ctdb->recovery_lock_file);
3158 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3159 ctdb->tunable.verify_recovery_lock = 0;
3160 if (ctdb->recovery_lock_fd != -1) {
3161 close(ctdb->recovery_lock_fd);
3162 ctdb->recovery_lock_fd = -1;
3165 talloc_free(tmp_ctx);
3169 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3170 TALLOC_CTX *mem_ctx)
3173 struct ctdb_node_map *nodemap=NULL;
3174 struct ctdb_node_map *recmaster_nodemap=NULL;
3175 struct ctdb_node_map **remote_nodemaps=NULL;
3176 struct ctdb_vnn_map *vnnmap=NULL;
3177 struct ctdb_vnn_map *remote_vnnmap=NULL;
3178 int32_t debug_level;
3183 /* verify that the main daemon is still running */
3184 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3185 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3189 /* ping the local daemon to tell it we are alive */
3190 ctdb_ctrl_recd_ping(ctdb);
3192 if (rec->election_timeout) {
3193 /* an election is in progress */
3197 /* read the debug level from the parent and update locally */
3198 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3200 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3203 LogLevel = debug_level;
3206 /* We must check if we need to ban a node here but we want to do this
3207 as early as possible so we dont wait until we have pulled the node
3208 map from the local node. thats why we have the hardcoded value 20
3210 for (i=0; i<ctdb->num_nodes; i++) {
3211 struct ctdb_banning_state *ban_state;
3213 if (ctdb->nodes[i]->ban_state == NULL) {
3216 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
3217 if (ban_state->count < 20) {
3220 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
3221 ctdb->nodes[i]->pnn, ban_state->count,
3222 ctdb->tunable.recovery_ban_period));
3223 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
3224 ban_state->count = 0;
3227 /* get relevant tunables */
3228 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3230 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3234 /* get the current recovery lock file from the server */
3235 if (update_recovery_lock_file(ctdb) != 0) {
3236 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3240 /* Make sure that if recovery lock verification becomes disabled when
3243 if (ctdb->tunable.verify_recovery_lock == 0) {
3244 if (ctdb->recovery_lock_fd != -1) {
3245 close(ctdb->recovery_lock_fd);
3246 ctdb->recovery_lock_fd = -1;
3250 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3251 if (pnn == (uint32_t)-1) {
3252 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
3256 /* get the vnnmap */
3257 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3259 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3264 /* get number of nodes */
3266 talloc_free(rec->nodemap);
3267 rec->nodemap = NULL;
3270 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3272 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3275 nodemap = rec->nodemap;
3277 /* update the capabilities for all nodes */
3278 ret = update_capabilities(ctdb, nodemap);
3280 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3284 /* check which node is the recovery master */
3285 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3287 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3291 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
3292 if (rec->recmaster != pnn) {
3293 if (rec->ip_reallocate_ctx != NULL) {
3294 talloc_free(rec->ip_reallocate_ctx);
3295 rec->ip_reallocate_ctx = NULL;
3296 rec->reallocate_callers = NULL;
3300 if (rec->recmaster == (uint32_t)-1) {
3301 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3302 force_election(rec, pnn, nodemap);
3306 /* if the local daemon is STOPPED, we verify that the databases are
3307 also frozen and thet the recmode is set to active
3309 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
3310 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3312 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3314 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3315 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
3317 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3319 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
3322 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3324 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
3331 /* If the local node is stopped, verify we are not the recmaster
3332 and yield this role if so
3334 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) && (rec->recmaster == pnn)) {
3335 DEBUG(DEBUG_ERR,("Local node is INACTIVE. Yielding recmaster role\n"));
3336 force_election(rec, pnn, nodemap);
3341 * if the current recmaster do not have CTDB_CAP_RECMASTER,
3342 * but we have force an election and try to become the new
3345 if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3346 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3347 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3348 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3349 " but we (node %u) have - force an election\n",
3350 rec->recmaster, pnn));
3351 force_election(rec, pnn, nodemap);
3355 /* check that we (recovery daemon) and the local ctdb daemon
3356 agrees on whether we are banned or not
3360 /* remember our own node flags */
3361 rec->node_flags = nodemap->nodes[pnn].flags;
3363 /* count how many active nodes there are */
3364 rec->num_active = 0;
3365 rec->num_connected = 0;
3366 for (i=0; i<nodemap->num; i++) {
3367 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3370 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3371 rec->num_connected++;
3376 /* verify that the recmaster node is still active */
3377 for (j=0; j<nodemap->num; j++) {
3378 if (nodemap->nodes[j].pnn==rec->recmaster) {
3383 if (j == nodemap->num) {
3384 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3385 force_election(rec, pnn, nodemap);
3389 /* if recovery master is disconnected we must elect a new recmaster */
3390 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3391 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3392 force_election(rec, pnn, nodemap);
3396 /* get nodemap from the recovery master to check if it is inactive */
3397 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3398 mem_ctx, &recmaster_nodemap);
3400 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3401 nodemap->nodes[j].pnn));
3406 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3407 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3408 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3409 force_election(rec, pnn, nodemap);
3413 /* If this node is stopped then it is not the recovery master
3414 * so the only remaining action is to potentially to verify
3415 * the local IP allocation below. This won't accomplish
3416 * anything useful so skip it.
3418 if (rec->node_flags & NODE_FLAGS_STOPPED) {
3422 /* verify that we have all ip addresses we should have and we dont
3423 * have addresses we shouldnt have.
3425 if (ctdb->tunable.disable_ip_failover == 0) {
3426 if (rec->ip_check_disable_ctx == NULL) {
3427 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3428 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3434 /* if we are not the recmaster then we do not need to check
3435 if recovery is needed
3437 if (pnn != rec->recmaster) {
3442 /* ensure our local copies of flags are right */
3443 ret = update_local_flags(rec, nodemap);
3444 if (ret == MONITOR_ELECTION_NEEDED) {
3445 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3446 force_election(rec, pnn, nodemap);
3449 if (ret != MONITOR_OK) {
3450 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3454 if (ctdb->num_nodes != nodemap->num) {
3455 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3456 reload_nodes_file(ctdb);
3460 /* verify that all active nodes agree that we are the recmaster */
3461 switch (verify_recmaster(rec, nodemap, pnn)) {
3462 case MONITOR_RECOVERY_NEEDED:
3463 /* can not happen */
3465 case MONITOR_ELECTION_NEEDED:
3466 force_election(rec, pnn, nodemap);
3470 case MONITOR_FAILED:
3475 if (rec->need_recovery) {
3476 /* a previous recovery didn't finish */
3477 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3481 /* verify that all active nodes are in normal mode
3482 and not in recovery mode
3484 switch (verify_recmode(ctdb, nodemap)) {
3485 case MONITOR_RECOVERY_NEEDED:
3486 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3488 case MONITOR_FAILED:
3490 case MONITOR_ELECTION_NEEDED:
3491 /* can not happen */
3497 if (ctdb->tunable.verify_recovery_lock != 0) {
3498 /* we should have the reclock - check its not stale */
3499 ret = check_recovery_lock(ctdb);
3501 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3502 ctdb_set_culprit(rec, ctdb->pnn);
3503 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3509 /* is there a pending reload all ips ? */
3510 if (reload_all_ips_request != NULL) {
3511 reload_all_ips(ctdb, rec, nodemap, reload_all_ips_request);
3512 talloc_free(reload_all_ips_request);
3513 reload_all_ips_request = NULL;
3516 /* if there are takeovers requested, perform it and notify the waiters */
3517 if (rec->reallocate_callers) {
3518 process_ipreallocate_requests(ctdb, rec);
3521 /* get the nodemap for all active remote nodes
3523 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3524 if (remote_nodemaps == NULL) {
3525 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3528 for(i=0; i<nodemap->num; i++) {
3529 remote_nodemaps[i] = NULL;
3531 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3532 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3536 /* verify that all other nodes have the same nodemap as we have
3538 for (j=0; j<nodemap->num; j++) {
3539 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3543 if (remote_nodemaps[j] == NULL) {
3544 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3545 ctdb_set_culprit(rec, j);
3550 /* if the nodes disagree on how many nodes there are
3551 then this is a good reason to try recovery
3553 if (remote_nodemaps[j]->num != nodemap->num) {
3554 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3555 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3556 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3557 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3561 /* if the nodes disagree on which nodes exist and are
3562 active, then that is also a good reason to do recovery
3564 for (i=0;i<nodemap->num;i++) {
3565 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3566 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3567 nodemap->nodes[j].pnn, i,
3568 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3569 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3570 do_recovery(rec, mem_ctx, pnn, nodemap,
3576 /* verify the flags are consistent
3578 for (i=0; i<nodemap->num; i++) {
3579 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3583 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3584 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3585 nodemap->nodes[j].pnn,
3586 nodemap->nodes[i].pnn,
3587 remote_nodemaps[j]->nodes[i].flags,
3588 nodemap->nodes[j].flags));
3590 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3591 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3592 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3593 do_recovery(rec, mem_ctx, pnn, nodemap,
3597 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3598 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3599 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3600 do_recovery(rec, mem_ctx, pnn, nodemap,
3609 /* there better be the same number of lmasters in the vnn map
3610 as there are active nodes or we will have to do a recovery
3612 if (vnnmap->size != rec->num_active) {
3613 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3614 vnnmap->size, rec->num_active));
3615 ctdb_set_culprit(rec, ctdb->pnn);
3616 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3620 /* verify that all active nodes in the nodemap also exist in
3623 for (j=0; j<nodemap->num; j++) {
3624 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3627 if (nodemap->nodes[j].pnn == pnn) {
3631 for (i=0; i<vnnmap->size; i++) {
3632 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3636 if (i == vnnmap->size) {
3637 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3638 nodemap->nodes[j].pnn));
3639 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3640 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3646 /* verify that all other nodes have the same vnnmap
3647 and are from the same generation
3649 for (j=0; j<nodemap->num; j++) {
3650 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3653 if (nodemap->nodes[j].pnn == pnn) {
3657 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3658 mem_ctx, &remote_vnnmap);
3660 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3661 nodemap->nodes[j].pnn));
3665 /* verify the vnnmap generation is the same */
3666 if (vnnmap->generation != remote_vnnmap->generation) {
3667 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3668 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3669 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3670 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3674 /* verify the vnnmap size is the same */
3675 if (vnnmap->size != remote_vnnmap->size) {
3676 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3677 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3678 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3679 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3683 /* verify the vnnmap is the same */
3684 for (i=0;i<vnnmap->size;i++) {
3685 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3686 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3687 nodemap->nodes[j].pnn));
3688 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3689 do_recovery(rec, mem_ctx, pnn, nodemap,
3696 /* we might need to change who has what IP assigned */
3697 if (rec->need_takeover_run) {
3698 uint32_t culprit = (uint32_t)-1;
3700 rec->need_takeover_run = false;
3702 /* update the list of public ips that a node can handle for
3705 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3707 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3709 rec->need_takeover_run = true;
3713 /* execute the "startrecovery" event script on all nodes */
3714 ret = run_startrecovery_eventscript(rec, nodemap);
3716 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3717 ctdb_set_culprit(rec, ctdb->pnn);
3718 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3722 ret = ctdb_takeover_run(ctdb, nodemap);
3724 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
3728 /* execute the "recovered" event script on all nodes */
3729 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3731 // we cant check whether the event completed successfully
3732 // since this script WILL fail if the node is in recovery mode
3733 // and if that race happens, the code here would just cause a second
3734 // cascading recovery.
3736 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3737 ctdb_set_culprit(rec, ctdb->pnn);
3738 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3745 the main monitoring loop
/*
 * Main entry point of the recovery daemon after startup: allocate the
 * daemon's private state (struct ctdb_recoverd), register all srvid
 * message handlers, then repeatedly run main_loop(), pacing iterations
 * to roughly one per recover_interval seconds.
 * NOTE(review): this function is not expected to return under normal
 * operation — the caller treats a return as a fatal error.
 */
3747 static void monitor_cluster(struct ctdb_context *ctdb)
3749 struct ctdb_recoverd *rec;
3751 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* Private recoverd state, zero-initialized and parented to the ctdb
 * context so it lives as long as the daemon.  Allocation failure is
 * fatal (CTDB_NO_MEMORY_FATAL aborts the process). */
3753 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3754 CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* remember when this daemon instance started; used elsewhere (e.g. in
 * recmaster elections) to prefer longer-running daemons */
3758 rec->priority_time = timeval_current();
/* Register the message ports (srvids) this daemon services.  Each
 * handler receives rec as its private_data. */
3760 /* register a message port for sending memory dumps */
3761 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3763 /* register a message port for recovery elections */
3764 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3766 /* when nodes are disabled/enabled */
3767 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3769 /* when we are asked to push out a flag change */
3770 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3772 /* register a message port for vacuum fetch */
3773 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3775 /* register a message port for reloadnodes */
3776 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3778 /* register a message port for performing a takeover run */
3779 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3781 /* register a message port for performing a reload all ips */
3782 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_ALL_IPS, ip_reloadall_handler, rec);
3784 /* register a message port for disabling the ip check for a short while */
3785 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3787 /* register a message port for updating the recovery daemons node assignment for an ip */
3788 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3790 /* register a message port for forcing a rebalance of a node next
3792 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
/* Per-iteration scratch context: everything main_loop() allocates on
 * mem_ctx is freed in one go after the iteration, preventing leaks in
 * the long-running loop. */
3795 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3796 struct timeval start;
/* failure to create the temp context is unrecoverable for this
 * iteration; log at CRIT */
3800 DEBUG(DEBUG_CRIT,(__location__
3801 " Failed to create temp context\n"));
3805 start = timeval_current();
3806 main_loop(ctdb, rec, mem_ctx);
3807 talloc_free(mem_ctx);
3809 /* we only check for recovery once every second */
/* Pace the loop: if main_loop() finished faster than recover_interval,
 * sleep out the remainder so checks happen at most once per interval. */
3810 elapsed = timeval_elapsed(&start);
3811 if (elapsed < ctdb->tunable.recover_interval) {
3812 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3819 event handler for when the main ctdbd dies
/*
 * fd-event callback, armed in the recovery daemon child on the read end
 * of the pipe shared with the main ctdbd.  It fires when the pipe
 * becomes readable — presumably because the parent exited and its write
 * end was closed (TODO confirm: the write end must be held only by the
 * parent for this to work).  The daemon cannot usefully run without its
 * parent, so it logs and exits (exit call is in an elided line).
 */
3821 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3822 uint16_t flags, void *private_data)
3824 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3829 called regularly to verify that the recovery daemon is still running
/*
 * Periodic watchdog, run in the main ctdbd every 30 seconds (armed in
 * ctdb_start_recoverd and re-armed below): verify the recovery daemon
 * child is still alive, and schedule an immediate restart if not.
 */
3831 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3832 struct timeval yt, void *p)
3834 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* signal 0 performs no action but fails if the pid no longer exists —
 * the standard liveness probe */
3836 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3837 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
/* restart as soon as the event loop runs again (zero timeout) */
3839 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
3840 ctdb_restart_recd, ctdb);
/* re-arm this check for another 30 seconds from now */
3845 event_add_timed(ctdb->ev, ctdb,
3846 timeval_current_ofs(30, 0),
3847 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap any exited child
 * processes so they do not linger as zombies.  Runs from the event
 * loop's signal dispatch, not directly in async-signal context.
 */
3850 static void recd_sig_child_handler(struct event_context *ev,
3851 struct signal_event *se, int signum, int count,
3855 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
/* non-blocking reap; presumably looped until no more children are
 * pending (loop construct is in an elided line — TODO confirm) */
3860 pid = waitpid(-1, &status, WNOHANG);
/* ECHILD just means there was nothing left to reap — only other
 * errno values are worth logging */
3862 if (errno != ECHILD) {
3863 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3868 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3874 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon as a child of the main ctdb daemon.
 *
 * Parent path: record the child's pid, arm the 30-second
 * ctdb_check_recd watchdog and return (return value comes from elided
 * lines; non-zero indicates failure to set up).
 *
 * Child path: never returns on success — it switches into client mode,
 * installs parent-death and SIGCHLD handlers, and enters
 * monitor_cluster() forever.  The pipe created below is the
 * parent-death detector: the child watches the read end and exits when
 * it becomes readable (see ctdb_recoverd_parent).
 */
3876 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3879 struct signal_event *se;
3880 struct tevent_fd *fde;
3882 if (pipe(fd) != 0) {
3886 ctdb->ctdbd_pid = getpid();
3888 ctdb->recoverd_pid = ctdb_fork(ctdb);
3889 if (ctdb->recoverd_pid == -1) {
/* parent: arm the periodic liveness check for the new child and
 * return to the main daemon's event loop */
3893 if (ctdb->recoverd_pid != 0) {
3895 event_add_timed(ctdb->ev, ctdb,
3896 timeval_current_ofs(30, 0),
3897 ctdb_check_recd, ctdb);
/* ---- child (recovery daemon) from here on ---- */
/* reseed the PRNG so parent and child do not share a random stream */
3903 srandom(getpid() ^ time(NULL));
/* the child must talk to ctdbd as a client, not act as a server */
3905 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
3906 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3910 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* watch the pipe's read end: it becomes readable when the parent's
 * write end is closed, i.e. when the main ctdbd dies */
3912 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
3913 ctdb_recoverd_parent, &fd[0]);
/* make sure fd[0] is closed automatically when fde is freed */
3914 tevent_fd_set_auto_close(fde);
3916 /* set up a handler to pick up sigchld */
3917 se = event_add_signal(ctdb->ev, ctdb,
3919 recd_sig_child_handler,
3922 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* run the daemon's main loop; this call should never return */
3926 monitor_cluster(ctdb);
3928 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3933 shutdown the recovery daemon
/*
 * Ask the recovery daemon child to shut down by sending it SIGTERM.
 * A recoverd_pid of 0 means no daemon was ever started (or it is the
 * child itself), so there is nothing to do.
 */
3935 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3937 if (ctdb->recoverd_pid == 0) {
3941 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3942 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
/*
 * Timed-event callback scheduled by ctdb_check_recd when the recovery
 * daemon is found dead: stop any remains of the old daemon, then fork
 * a fresh one.  Runs in the main ctdbd.
 */
3945 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
3946 struct timeval t, void *private_data)
3948 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3950 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3951 ctdb_stop_recoverd(ctdb);
3952 ctdb_start_recoverd(ctdb);