4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
30 #include "dlinklist.h"
/* Most recent "reload all IPs" request, kept until the takeover run it
 * triggered has completed.  NOTE(review): the original comment is truncated
 * in this extract. */
33 /* most recent reload all ips request we need to perform during the
36 struct reloadips_all_reply *reload_all_ips_request = NULL;

/* Singly linked (via ->next) list of "ctdb ipreallocate" callers that must
 * be replied to once the takeover run has finished. */
38 /* list of "ctdb ipreallocate" processes to call back when we have
39 finished the takeover run.
41 struct ip_reallocate_list {
42 struct ip_reallocate_list *next;
43 struct rd_memdump_reply *rd;

/* Per-node misbehaviour accounting used when deciding whether to ban a
 * node.  NOTE(review): a credit/count field appears to be missing from
 * this extract (ban_state->count is used below) - confirm upstream. */
46 struct ctdb_banning_state {
48 struct timeval last_reported_time;
/*
 * Private state of the recovery daemon (one instance).
 */
52 private state of recovery daemon
54 struct ctdb_recoverd {
55 struct ctdb_context *ctdb;
/* cluster view: connected node count, last blamed node, current node map */
58 uint32_t num_connected;
59 uint32_t last_culprit_node;
60 struct ctdb_node_map *nodemap;
61 struct timeval priority_time;
62 bool need_takeover_run;
/* election timers; vacuum_info is the head of the in-flight vacuum list */
65 struct timed_event *send_election_te;
66 struct timed_event *election_timeout;
67 struct vacuum_info *vacuum_info;
/* deferred "ctdb ipreallocate" callers (see struct ip_reallocate_list) */
68 TALLOC_CTX *ip_reallocate_ctx;
69 struct ip_reallocate_list *reallocate_callers;
70 TALLOC_CTX *ip_check_disable_ctx;
71 struct ctdb_control_get_ifaces *ifaces;
72 TALLOC_CTX *deferred_rebalance_ctx;
/* Timeouts used for controls sent during recovery and monitoring, driven
 * by the recover_timeout / recover_interval tunables. */
75 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
76 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

/* Forward declaration: timed-event handler that restarts the recovery
 * daemon (defined later in this file). */
78 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
/*
 * Ban a node (identified by pnn) for ban_time seconds: validate the pnn,
 * then send a SET_BAN control to that node.
 * NOTE(review): this extract drops several lines (e.g. declaration of
 * ret, the bantime.pnn assignment, braces/returns) - confirm upstream.
 */
81 ban a node for a period of time
83 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
86 struct ctdb_context *ctdb = rec->ctdb;
87 struct ctdb_ban_time bantime;
/* refuse to act on a pnn that is not a configured node */
89 if (!ctdb_validate_pnn(ctdb, pnn)) {
90 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
94 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
97 bantime.time = ban_time;
99 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
101 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* Outcome of one monitoring pass over the cluster. */
107 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/*
 * Charge `culprit` with `count` credits of blame.  Credits accumulate in
 * the node's ban_state; if the node behaved for longer than the
 * recovery_grace_period tunable, old transgressions are forgiven first.
 * A locally inactive (banned/stopped) node never assigns blame.
 */
111 remember the trouble maker
113 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
115 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
116 struct ctdb_banning_state *ban_state;
/* bounds-check the culprit pnn against the node array */
118 if (culprit > ctdb->num_nodes) {
119 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
123 /* If we are banned or stopped, do not set other nodes as culprits */
124 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
125 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
/* lazily allocate per-node ban accounting on first blame */
129 if (ctdb->nodes[culprit]->ban_state == NULL) {
130 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
131 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
135 ban_state = ctdb->nodes[culprit]->ban_state;
136 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
137 /* this was the first time in a long while this node
138 misbehaved so we will forgive any old transgressions.
140 ban_state->count = 0;
143 ban_state->count += count;
144 ban_state->last_reported_time = timeval_current();
145 rec->last_culprit_node = culprit;
/* Convenience wrapper: blame `culprit` with a single credit. */
149 remember the trouble maker
151 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
153 ctdb_set_culprit_count(rec, culprit, 1);
/* Async-control failure callback: a node failed the "recovered" event,
 * so mark it as a culprit for future ban accounting. */
157 /* this callback is called for every node that failed to execute the
160 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
162 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
164 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
166 ctdb_set_culprit(rec, node_pnn);
/*
 * Run the "recovered" eventscript on all active nodes by broadcasting an
 * END_RECOVERY control; nodes that fail are blamed via
 * recovered_fail_callback.  `caller` is only used in the error message.
 */
170 run the "recovered" eventscript on all nodes
172 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
176 struct ctdb_context *ctdb = rec->ctdb;
178 tmp_ctx = talloc_new(ctdb);
179 CTDB_NO_MEMORY(ctdb, tmp_ctx);
181 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
182 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
184 CONTROL_TIMEOUT(), false, tdb_null,
185 NULL, recovered_fail_callback,
187 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
189 talloc_free(tmp_ctx);
193 talloc_free(tmp_ctx);
/* Async-control failure callback: a node failed the "startrecovery"
 * event, so mark it as a culprit for future ban accounting. */
197 /* this callback is called for every node that failed to execute the
200 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
202 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
204 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
206 ctdb_set_culprit(rec, node_pnn);
/*
 * Run the "startrecovery" eventscript on all active nodes by broadcasting
 * a START_RECOVERY control; failing nodes are blamed via
 * startrecovery_fail_callback.
 */
210 run the "startrecovery" eventscript on all nodes
212 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
216 struct ctdb_context *ctdb = rec->ctdb;
218 tmp_ctx = talloc_new(ctdb);
219 CTDB_NO_MEMORY(ctdb, tmp_ctx);
221 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
222 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
224 CONTROL_TIMEOUT(), false, tdb_null,
226 startrecovery_fail_callback,
228 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
229 talloc_free(tmp_ctx);
233 talloc_free(tmp_ctx);
/* Per-node success callback for GET_CAPABILITIES: validate the reply
 * size, store the capability mask in ctdb->nodes[], and mirror it into
 * ctdb->capabilities when the reply is for the local node. */
237 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
239 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
240 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
243 if (node_pnn < ctdb->num_nodes) {
244 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
247 if (node_pnn == ctdb->pnn) {
248 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
/*
 * Refresh the capability masks of all connected nodes by broadcasting a
 * GET_CAPABILITIES control; replies are recorded by async_getcap_callback.
 */
253 update the node capabilities for all connected nodes
255 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
260 tmp_ctx = talloc_new(ctdb);
261 CTDB_NO_MEMORY(ctdb, tmp_ctx);
263 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
264 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
268 async_getcap_callback, NULL,
270 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
271 talloc_free(tmp_ctx);
275 talloc_free(tmp_ctx);
/* Failure callback for the freeze step of set_recovery_mode(): blame the
 * node heavily (nodemap->num credits) so repeated failures get it banned. */
279 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
281 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
283 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
284 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Failure callback for starting a recovery transaction on a node: blame
 * the node heavily (nodemap->num credits). */
287 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
289 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
291 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
292 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/*
 * Switch the recovery mode on all active nodes.  When entering
 * CTDB_RECOVERY_ACTIVE, first freeze every database priority level on all
 * active nodes, then broadcast SET_RECMODE with the new mode.
 * NOTE(review): the freeze loop below starts at i=1; priorities are
 * 1..NUM_DB_PRIORITIES - confirm against upstream.
 */
296 change recovery mode on all nodes
298 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
304 tmp_ctx = talloc_new(ctdb);
305 CTDB_NO_MEMORY(ctdb, tmp_ctx);
307 /* freeze all nodes */
308 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
309 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
312 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
313 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
318 set_recmode_fail_callback,
320 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
321 talloc_free(tmp_ctx);
/* broadcast the new recovery mode to all active nodes */
328 data.dsize = sizeof(uint32_t);
329 data.dptr = (unsigned char *)&rec_mode;
331 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
337 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
338 talloc_free(tmp_ctx);
342 talloc_free(tmp_ctx);
/*
 * Tell every active node who the recovery master is, by broadcasting a
 * SET_RECMASTER control carrying the master's pnn.
 */
347 change recovery master on all node
349 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
355 tmp_ctx = talloc_new(ctdb);
356 CTDB_NO_MEMORY(ctdb, tmp_ctx);
358 data.dsize = sizeof(uint32_t);
359 data.dptr = (unsigned char *)&pnn;
361 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
362 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
364 CONTROL_TIMEOUT(), false, data,
367 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
368 talloc_free(tmp_ctx);
372 talloc_free(tmp_ctx);
/* For every local database, read its priority from the local node and
 * broadcast SET_DB_PRIORITY to all active nodes, so the cluster agrees on
 * priorities.  Best-effort: failures never fail a recovery. */
376 /* update all remote nodes to use the same db priority that we have
377 this can fail if the remote node has not yet been upgraded to
378 support this function, so we always return success and never fail
379 a recovery if this call fails.
381 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
382 struct ctdb_node_map *nodemap,
383 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
388 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
390 /* step through all local databases */
391 for (db=0; db<dbmap->num;db++) {
393 struct ctdb_db_priority db_prio;
396 db_prio.db_id = dbmap->dbs[db].dbid;
397 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
399 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
403 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
405 data.dptr = (uint8_t *)&db_prio;
406 data.dsize = sizeof(db_prio);
408 if (ctdb_client_async_control(ctdb,
409 CTDB_CONTROL_SET_DB_PRIORITY,
411 CONTROL_TIMEOUT(), false, data,
414 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
/*
 * Ensure every other available node is attached to all databases that we
 * have locally: fetch each remote node's dbmap, and create (attach) any
 * local database it lacks, preserving the persistent flag.
 */
422 ensure all other nodes have attached to any databases that we have
424 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
425 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
428 struct ctdb_dbid_map *remote_dbmap;
430 /* verify that all other nodes have all our databases */
431 for (j=0; j<nodemap->num; j++) {
432 /* we don't need to check ourselves */
433 if (nodemap->nodes[j].pnn == pnn) {
436 /* dont check nodes that are unavailable */
437 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
441 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
442 mem_ctx, &remote_dbmap);
444 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
448 /* step through all local databases */
449 for (db=0; db<dbmap->num;db++) {
/* linear search: does the remote node already know this dbid? */
453 for (i=0;i<remote_dbmap->num;i++) {
454 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
458 /* the remote node already have this database */
459 if (i!=remote_dbmap->num) {
462 /* ok so we need to create this database */
463 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
466 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
469 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
471 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
473 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
/*
 * Ensure we are locally attached to every database any other available
 * node has: fetch each remote dbmap, create any database we lack
 * (preserving the persistent flag), and finally re-read our own dbmap
 * into *dbmap so the caller sees the updated set.
 */
484 ensure we are attached to any databases that anyone else is attached to
486 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
487 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
490 struct ctdb_dbid_map *remote_dbmap;
492 /* verify that we have all database any other node has */
493 for (j=0; j<nodemap->num; j++) {
494 /* we don't need to check ourselves */
495 if (nodemap->nodes[j].pnn == pnn) {
498 /* dont check nodes that are unavailable */
499 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
503 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
504 mem_ctx, &remote_dbmap);
506 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
510 /* step through all databases on the remote node */
511 for (db=0; db<remote_dbmap->num;db++) {
/* linear search: do we already have this dbid locally? */
514 for (i=0;i<(*dbmap)->num;i++) {
515 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
519 /* we already have this db locally */
520 if (i!=(*dbmap)->num) {
523 /* ok so we need to create this database and
526 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
527 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
529 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
530 nodemap->nodes[j].pnn));
533 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
534 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
536 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our dbmap so it includes the newly attached database */
539 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
541 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
/*
 * Pull the contents of database `dbid` from node `srcnode` into the local
 * temporary recovery tdb (`recdb`), merging record-by-record: an incoming
 * record replaces an existing one only if it has a higher RSN, or an
 * equal RSN but a dmaster other than the recovery master.
 * NOTE(review): this extract drops lines (the for-loop header, some
 * declarations and braces) - confirm against upstream.
 */
552 pull the remote database contents from one node into the recdb
554 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
555 struct tdb_wrap *recdb, uint32_t dbid)
559 struct ctdb_marshall_buffer *reply;
560 struct ctdb_rec_data *rec;
562 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
564 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
565 CONTROL_TIMEOUT(), &outdata);
567 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
568 talloc_free(tmp_ctx);
572 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity check the marshalled reply before walking it */
574 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
575 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
576 talloc_free(tmp_ctx);
/* walk the packed records; each ctdb_rec_data is rec->length bytes */
580 rec = (struct ctdb_rec_data *)&reply->data[0];
584 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
586 struct ctdb_ltdb_header *hdr;
/* key and data are packed back-to-back inside rec->data[] */
589 key.dptr = &rec->data[0];
590 key.dsize = rec->keylen;
591 data.dptr = &rec->data[key.dsize];
592 data.dsize = rec->datalen;
594 hdr = (struct ctdb_ltdb_header *)data.dptr;
596 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
597 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
598 talloc_free(tmp_ctx);
602 /* fetch the existing record, if any */
603 existing = tdb_fetch(recdb->tdb, key);
605 if (existing.dptr != NULL) {
606 struct ctdb_ltdb_header header;
607 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
608 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
609 (unsigned)existing.dsize, srcnode));
611 talloc_free(tmp_ctx);
614 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing copy unless the incoming record wins the RSN merge */
616 if (!(header.rsn < hdr->rsn ||
617 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
622 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
623 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
624 talloc_free(tmp_ctx);
629 talloc_free(tmp_ctx);
/* Callback state for the highest-seqnum scan: tracks the best (pnn,
 * seqnum) pair seen and a failure flag.  NOTE(review): the struct fields
 * are missing from this extract - confirm upstream. */
635 struct pull_seqnum_cbdata {
/* Per-node success callback for GET_DB_SEQNUM: validate the 8-byte reply
 * and remember the node holding the highest sequence number so far.
 * Does nothing once the overall operation has been marked failed. */
641 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
643 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
646 if (cb_data->failed != 0) {
647 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
652 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
657 if (outdata.dsize != sizeof(uint64_t)) {
658 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
659 cb_data->failed = -1;
663 seqnum = *((uint64_t *)outdata.dptr);
/* track the highest seqnum and which node holds it */
665 if (seqnum > cb_data->seqnum) {
666 cb_data->seqnum = seqnum;
667 cb_data->pnn = node_pnn;
/* Per-node failure callback for GET_DB_SEQNUM: log the failure.
 * NOTE(review): setting cb_data->failed appears to be on a dropped line. */
671 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
673 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
675 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
/*
 * Recover a persistent database by sequence number: query GET_DB_SEQNUM
 * on all active nodes, pick the node with the highest seqnum, and pull
 * the whole database from just that node into recdb.
 */
679 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
680 struct ctdb_recoverd *rec,
681 struct ctdb_node_map *nodemap,
682 struct tdb_wrap *recdb, uint32_t dbid)
684 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
688 struct pull_seqnum_cbdata *cb_data;
690 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
/* request payload: the dbid marshalled for GET_DB_SEQNUM */
695 data.dsize = sizeof(outdata);
696 data.dptr = (uint8_t *)&outdata[0];
698 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
699 if (cb_data == NULL) {
700 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
701 talloc_free(tmp_ctx);
709 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
710 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
712 CONTROL_TIMEOUT(), false, data,
716 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
718 talloc_free(tmp_ctx);
722 if (cb_data->failed != 0) {
723 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
724 talloc_free(tmp_ctx);
/* no usable winner: seqnum 0 or no node answered */
728 if (cb_data->seqnum == 0 || cb_data->pnn == -1) {
729 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
730 talloc_free(tmp_ctx);
734 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
736 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
737 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
738 talloc_free(tmp_ctx);
742 talloc_free(tmp_ctx);
/*
 * Pull database `dbid` from the cluster into recdb.  For persistent
 * databases (when the recover_pdb_by_seqnum tunable is set) use the
 * highest-seqnum strategy; otherwise merge records from every available
 * node, RSN-based, blaming nodes whose pull fails.
 */
748 pull all the remote database contents into the recdb
750 static int pull_remote_database(struct ctdb_context *ctdb,
751 struct ctdb_recoverd *rec,
752 struct ctdb_node_map *nodemap,
753 struct tdb_wrap *recdb, uint32_t dbid,
758 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
760 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
766 /* pull all records from all other nodes across onto this node
767 (this merges based on rsn)
769 for (j=0; j<nodemap->num; j++) {
770 /* dont merge from nodes that are unavailable */
771 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
774 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
775 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
776 nodemap->nodes[j].pnn));
777 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
/*
 * Push node flags for `pnn` out to all nodes via the MODFLAGS control
 * (set `flags`, clear everything else).
 */
787 update flags on all active nodes
789 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
793 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
795 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/*
 * Push our vnnmap to every available node so the whole cluster agrees on
 * the virtual-node mapping.
 */
803 ensure all nodes have the same vnnmap we do
805 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
806 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
810 /* push the new vnn map out to all the nodes */
811 for (j=0; j<nodemap->num; j++) {
812 /* dont push to nodes that are unavailable */
813 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
817 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
819 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* Per-source-node vacuum-fetch work item, kept on rec->vacuum_info as a
 * doubly linked list.  NOTE(review): the opening "struct vacuum_info {"
 * line and some fields (e.g. srcnode) are missing from this extract. */
829 struct vacuum_info *next, *prev;
830 struct ctdb_recoverd *rec;
832 struct ctdb_db_context *ctdb_db;
833 struct ctdb_marshall_buffer *recs;
834 struct ctdb_rec_data *r;

/* forward declaration: drives processing of the next vacuum record */
837 static void vacuum_fetch_next(struct vacuum_info *v);
/*
 * Completion callback for a vacuum fetch call: continue with the next
 * record from the same vacuum_info work item.
 */
840 called when a vacuum fetch has completed - just free it and do the next one
842 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
844 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
846 vacuum_fetch_next(v);
/*
 * Process the next record on the vacuum list: skip records we cannot
 * chainlock without blocking, records with no/short data, and records we
 * already master; otherwise issue an async NULL_FUNC call with
 * IMMEDIATE_MIGRATION to migrate the record here, resuming from
 * vacuum_fetch_callback when it completes.
 */
851 process the next element from the vacuum list
853 static void vacuum_fetch_next(struct vacuum_info *v)
855 struct ctdb_call call;
856 struct ctdb_rec_data *r;
858 while (v->recs->count) {
859 struct ctdb_client_call_state *state;
861 struct ctdb_ltdb_header *hdr;
864 call.call_id = CTDB_NULL_FUNC;
865 call.flags = CTDB_IMMEDIATE_MIGRATION;
866 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
/* advance v->r to the next packed record before processing this one */
869 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
872 call.key.dptr = &r->data[0];
873 call.key.dsize = r->keylen;
875 /* ensure we don't block this daemon - just skip a record if we can't get
877 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
881 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
882 if (data.dptr == NULL) {
883 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
887 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
889 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
893 hdr = (struct ctdb_ltdb_header *)data.dptr;
894 if (hdr->dmaster == v->rec->ctdb->pnn) {
895 /* its already local */
897 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
/* hand the migration off asynchronously; the chainlock is released
 * before we wait for the call to complete */
903 state = ctdb_call_send(v->ctdb_db, &call);
904 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
906 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
910 state->async.fn = vacuum_fetch_callback;
911 state->async.private_data = v;
/*
 * talloc destructor: unlink the vacuum_info from the recovery daemon's
 * vacuum list when it is freed.
 */
920 destroy a vacuum info structure
922 static int vacuum_info_destructor(struct vacuum_info *v)
924 DLIST_REMOVE(v->rec->vacuum_info, v);
/*
 * SRVID message handler for vacuum-fetch requests: a remote node sends a
 * marshalled buffer of records it wants migrated to us.  Dedupe against
 * in-flight work for the same (srcnode, db), look up the database's name
 * and persistence, attach to it, then queue a vacuum_info work item and
 * start processing it.
 * NOTE(review): this extract drops lines (e.g. the srcnode declaration /
 * assignment and several braces) - confirm against upstream.
 */
930 handler for vacuum fetch
932 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
933 TDB_DATA data, void *private_data)
935 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
936 struct ctdb_marshall_buffer *recs;
938 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
940 struct ctdb_dbid_map *dbmap=NULL;
941 bool persistent = false;
942 struct ctdb_db_context *ctdb_db;
943 struct ctdb_rec_data *r;
945 struct vacuum_info *v;
947 recs = (struct ctdb_marshall_buffer *)data.dptr;
948 r = (struct ctdb_rec_data *)&recs->data[0];
/* nothing to do for an empty record set */
950 if (recs->count == 0) {
951 talloc_free(tmp_ctx);
/* already processing records from this node for this db? then drop */
957 for (v=rec->vacuum_info;v;v=v->next) {
958 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
959 /* we're already working on records from this node */
960 talloc_free(tmp_ctx);
965 /* work out if the database is persistent */
966 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
968 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
969 talloc_free(tmp_ctx);
973 for (i=0;i<dbmap->num;i++) {
974 if (dbmap->dbs[i].dbid == recs->db_id) {
975 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
979 if (i == dbmap->num) {
980 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
981 talloc_free(tmp_ctx);
985 /* find the name of this database */
986 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
987 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
988 talloc_free(tmp_ctx);
/* attach (or re-attach) to the database before migrating records */
993 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
994 if (ctdb_db == NULL) {
995 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
996 talloc_free(tmp_ctx);
1000 v = talloc_zero(rec, struct vacuum_info);
1002 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1003 talloc_free(tmp_ctx);
/* copy the marshalled records so they outlive this handler */
1008 v->srcnode = srcnode;
1009 v->ctdb_db = ctdb_db;
1010 v->recs = talloc_memdup(v, recs, data.dsize);
1011 if (v->recs == NULL) {
1012 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1014 talloc_free(tmp_ctx);
1017 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
/* track the work item; destructor unlinks it again */
1019 DLIST_ADD(rec->vacuum_info, v);
1021 talloc_set_destructor(v, vacuum_info_destructor);
1023 vacuum_fetch_next(v);
1024 talloc_free(tmp_ctx);
/*
 * Timed-event handler for ctdb_wait_timeout(): flips the caller's
 * timed_out flag so its event loop exits.
 */
1029 called when ctdb_wait_timeout should finish
1031 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1032 struct timeval yt, void *p)
1034 uint32_t *timed_out = (uint32_t *)p;
/*
 * Block for `secs` (fractional) seconds while still servicing events:
 * schedule a timer that sets timed_out, then pump the event loop.
 */
1039 wait for a given number of seconds
1041 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1043 uint32_t timed_out = 0;
/* split the fractional part into microseconds for timeval_current_ofs */
1044 time_t usecs = (secs - (time_t)secs) * 1000000;
1045 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1046 while (!timed_out) {
1047 event_loop_once(ctdb->ev);
/*
 * Timed-event handler fired when an election period ends: clear the
 * election_timeout marker so ctdb_wait_election() can return.
 */
1052 called when an election times out (ends)
1054 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1055 struct timeval t, void *p)
1057 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1058 rec->election_timeout = NULL;
1061 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
/*
 * Pump the event loop until the election finishes, i.e. until
 * ctdb_election_timeout() clears rec->election_timeout.
 */
1066 wait for an election to finish. It finished election_timeout seconds after
1067 the last election packet is received
1069 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1071 struct ctdb_context *ctdb = rec->ctdb;
1072 while (rec->election_timeout) {
1073 event_loop_once(ctdb->ev);
/*
 * Compare our view of every reachable node's flags with that node's own
 * view; on mismatch, push the remote node's flags cluster-wide via
 * MODFLAGS and update our local nodemap copy.  Returns MONITOR_FAILED if
 * a remote nodemap cannot be fetched (blaming that node).
 */
1078 Update our local flags from all remote connected nodes.
1079 This is only run when we are or we believe we are the recovery master
1081 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1084 struct ctdb_context *ctdb = rec->ctdb;
1085 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1087 /* get the nodemap for all active remote nodes and verify
1088 they are the same as for this node
1090 for (j=0; j<nodemap->num; j++) {
1091 struct ctdb_node_map *remote_nodemap=NULL;
/* skip disconnected nodes and ourselves */
1094 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1097 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1101 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1102 mem_ctx, &remote_nodemap);
1104 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1105 nodemap->nodes[j].pnn));
1106 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1107 talloc_free(mem_ctx);
1108 return MONITOR_FAILED;
1110 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1111 /* We should tell our daemon about this so it
1112 updates its flags or else we will log the same
1113 message again in the next iteration of recovery.
1114 Since we are the recovery master we can just as
1115 well update the flags on all nodes.
1117 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1119 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1123 /* Update our local copy of the flags in the recovery
1126 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1127 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1128 nodemap->nodes[j].flags));
1129 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1131 talloc_free(remote_nodemap);
1133 talloc_free(mem_ctx);
/* Create a new random generation id.
 * The generation id can not be the INVALID_GENERATION id, so keep
 * drawing random values until a valid one comes up. */
1138 /* Create a new random generation id.
1139 The generation id can not be the INVALID_GENERATION id
1141 static uint32_t new_generation(void)
1143 uint32_t generation;
1146 generation = random();
1148 if (generation != INVALID_GENERATION) {
/*
 * Create the temporary recovery tdb (recdb.tdb.<pnn>) under the state
 * directory.  Opened with TDB_NOLOCK (single-writer during recovery) and
 * O_CREAT|O_EXCL so a stale file is never silently reused; NOMMAP is
 * added under valgrind.
 */
1158 create a temporary working database
1160 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1163 struct tdb_wrap *recdb;
1166 /* open up the temporary recovery database */
1167 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1168 ctdb->db_directory_state,
1175 tdb_flags = TDB_NOLOCK;
1176 if (ctdb->valgrinding) {
1177 tdb_flags |= TDB_NOMMAP;
1179 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1181 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1182 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1183 if (recdb == NULL) {
1184 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
/* Accumulator state passed to traverse_recdb(): the marshall buffer being
 * built, its current and allocated lengths, plus failure/persistence
 * flags.  NOTE(review): the opening "struct recdb_data {" line and some
 * fields (len, failed, persistent) are missing from this extract. */
1194 a traverse function for pulling all relevant records from recdb
1197 struct ctdb_context *ctdb;
1198 struct ctdb_marshall_buffer *recdata;
1200 uint32_t allocated_len;
/*
 * tdb_traverse_read callback: append one recdb record to the marshall
 * buffer being built for PUSH_DB.  Empty records are skipped for
 * non-persistent databases only (see the long rationale below); for
 * non-persistent databases the dmaster is rewritten to point at us.
 * Grows the buffer with pulldb_preallocation_size headroom as needed.
 */
1205 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1207 struct recdb_data *params = (struct recdb_data *)p;
1208 struct ctdb_rec_data *rec;
1209 struct ctdb_ltdb_header *hdr;
1212 * skip empty records - but NOT for persistent databases:
1214 * The record-by-record mode of recovery deletes empty records.
1215 * For persistent databases, this can lead to data corruption
1216 * by deleting records that should be there:
1218 * - Assume the cluster has been running for a while.
1220 * - A record R in a persistent database has been created and
1221 * deleted a couple of times, the last operation being deletion,
1222 * leaving an empty record with a high RSN, say 10.
1224 * - Now a node N is turned off.
1226 * - This leaves the local database copy of D on N with the empty
1227 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1228 * the copy of record R.
1230 * - Now the record is created again while node N is turned off.
1231 * This creates R with RSN = 1 on all nodes except for N.
1233 * - Now node N is turned on again. The following recovery will choose
1234 * the older empty copy of R due to RSN 10 > RSN 1.
1236 * ==> Hence the record is gone after the recovery.
1238 * On databases like Samba's registry, this can damage the higher-level
1239 * data structures built from the various tdb-level records.
1241 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1245 /* update the dmaster field to point to us */
1246 hdr = (struct ctdb_ltdb_header *)data.dptr;
1247 if (!params->persistent) {
1248 hdr->dmaster = params->ctdb->pnn;
1249 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1252 /* add the record to the blob ready to send to the nodes */
1253 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1255 params->failed = true;
/* grow the marshall buffer with headroom when the record won't fit */
1258 if (params->len + rec->length >= params->allocated_len) {
1259 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1260 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1262 if (params->recdata == NULL) {
1263 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1264 rec->length + params->len));
1265 params->failed = true;
1268 params->recdata->count++;
1269 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1270 params->len += rec->length;
/*
 * Push the assembled recovery database out to all active nodes: traverse
 * recdb with traverse_recdb() to build a marshall buffer, then broadcast
 * it via a PUSH_DB control.
 */
1277 push the recdb database out to all nodes
1279 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1281 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1283 struct recdb_data params;
1284 struct ctdb_marshall_buffer *recdata;
1286 TALLOC_CTX *tmp_ctx;
1289 tmp_ctx = talloc_new(ctdb);
1290 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1292 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1293 CTDB_NO_MEMORY(ctdb, recdata);
1295 recdata->db_id = dbid;
/* seed traversal state; len starts at the marshall header size */
1298 params.recdata = recdata;
1299 params.len = offsetof(struct ctdb_marshall_buffer, data);
1300 params.allocated_len = params.len;
1301 params.failed = false;
1302 params.persistent = persistent;
1304 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1305 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1306 talloc_free(params.recdata);
1307 talloc_free(tmp_ctx);
1311 if (params.failed) {
1312 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1313 talloc_free(params.recdata);
1314 talloc_free(tmp_ctx);
/* the traversal may have reallocated the buffer - pick up the new ptr */
1318 recdata = params.recdata;
1320 outdata.dptr = (void *)recdata;
1321 outdata.dsize = params.len;
1323 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1324 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1326 CONTROL_TIMEOUT(), false, outdata,
1329 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1330 talloc_free(recdata);
1331 talloc_free(tmp_ctx);
1335 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1336 dbid, recdata->count));
1338 talloc_free(recdata);
1339 talloc_free(tmp_ctx);
/*
 * recover_database: run the full recovery sequence for a single
 * database — pull all remote copies into a local recovery db, wipe the
 * database on all active nodes (inside the cluster-wide transaction
 * identified by transaction_id), then push the merged copy back out.
 * NOTE(review): excerpt is missing lines (braces, returns, params such
 * as dbid/persistent); code kept byte-identical, comments only added.
 */
1346   go through a full recovery on one database
1348 static int recover_database(struct ctdb_recoverd *rec,
1349 			    TALLOC_CTX *mem_ctx,
1353 			    struct ctdb_node_map *nodemap,
1354 			    uint32_t transaction_id)
1356 	struct tdb_wrap *recdb;
1358 	struct ctdb_context *ctdb = rec->ctdb;
1360 	struct ctdb_control_wipe_database w;
	/* temporary local tdb used to merge all remote copies */
1363 	recdb = create_recdb(ctdb, mem_ctx);
1364 	if (recdb == NULL) {
1368 	/* pull all remote databases onto the recdb */
1369 	ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1371 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1375 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1377 	/* wipe all the remote databases. This is safe as we are in a transaction */
1379 	w.transaction_id = transaction_id;
1381 	data.dptr = (void *)&w;
1382 	data.dsize = sizeof(w);
1384 	nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1385 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1387 					CONTROL_TIMEOUT(), false, data,
1390 		DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1395 	/* push out the correct database. This sets the dmaster and skips
1396 	   the empty records */
1397 	ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1403 	/* all done with this database */
/*
 * reload_nodes_file: re-read the cluster nodes file into the ctdb
 * context.  Thin wrapper around ctdb_load_nodes_file().
 * NOTE(review): braces are missing from this excerpt.
 */
1410   reload the nodes file
1412 static void reload_nodes_file(struct ctdb_context *ctdb)
1415 	ctdb_load_nodes_file(ctdb);
/*
 * ctdb_reload_remote_public_ips: refresh the cached known/available
 * public-IP lists for every node in the nodemap by querying each node.
 * On failure, *culprit is set to the pnn of the node that failed so the
 * caller can assign recovery blame.  Inactive nodes are skipped after
 * their stale cached lists are freed.
 * NOTE(review): excerpt is missing lines (the culprit parameter
 * declaration, closing braces, error returns); code kept byte-identical.
 */
1418 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1419 					 struct ctdb_recoverd *rec,
1420 					 struct ctdb_node_map *nodemap,
	/* sanity: the local node list and the nodemap must agree in size */
1426 	if (ctdb->num_nodes != nodemap->num) {
1427 		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1428 		     ctdb->num_nodes, nodemap->num));
1430 		*culprit = ctdb->pnn;
1435 	for (j=0; j<nodemap->num; j++) {
1436 		/* For readability */
1437 		struct ctdb_node *node = ctdb->nodes[j];
1439 		/* release any existing data */
1440 		if (node->known_public_ips) {
1441 			talloc_free(node->known_public_ips);
1442 			node->known_public_ips = NULL;
1444 		if (node->available_public_ips) {
1445 			talloc_free(node->available_public_ips);
1446 			node->available_public_ips = NULL;
		/* inactive nodes keep NULL ip lists and are not queried */
1449 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1453 		/* Retrieve the list of known public IPs from the node */
1454 		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1459 					&node->known_public_ips);
1462 			      ("Failed to read known public IPs from node: %u\n",
1465 			*culprit = node->pnn;
		/* verify the remote allocation unless ip checking is
		   temporarily disabled; a mismatch forces a takeover run */
1470 		if (ctdb->do_checkpublicip &&
1471 		    (rec->ip_check_disable_ctx == NULL) &&
1472 		    verify_remote_ip_allocation(ctdb,
1473 						node->known_public_ips,
1475 			DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1476 			rec->need_takeover_run = true;
1479 		/* Retrieve the list of available public IPs from the node */
1480 		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1484 					CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1485 					&node->available_public_ips);
1488 			      ("Failed to read available public IPs from node: %u\n",
1491 			*culprit = node->pnn;
1500 /* when we start a recovery, make sure all nodes use the same reclock file
1503 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1505 	struct ctdb_context *ctdb = rec->ctdb;
1506 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	/* nothing to sync if no recovery lock file is configured locally;
	   NOTE(review): early-return lines are missing from this excerpt */
1510 	if (ctdb->recovery_lock_file == NULL) {
	/* ship the path including its terminating NUL */
1514 	data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1515 	data.dptr  = (uint8_t *)ctdb->recovery_lock_file;
1518 	nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1519 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1525 		DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1526 		talloc_free(tmp_ctx);
1530 	talloc_free(tmp_ctx);
1536  * this callback is called for every node that failed to execute ctdb_takeover_run()
1537  * and set flag to re-run takeover run.
/* Blames the failing node (when rec state is available) and schedules a
   retry by setting rec->need_takeover_run.  callback_data may be NULL. */
1539 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1541 	DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1543 	if (callback_data != NULL) {
1544 		struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1546 		DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1548 		ctdb_set_culprit(rec, node_pnn);
1549 		rec->need_takeover_run = true;
/*
 * ban_misbehaving_nodes: ban any node whose accumulated banning credits
 * reached twice the cluster size, for recovery_ban_period seconds, then
 * reset its credit counter.  *self_ban is presumably set when the local
 * node bans itself — TODO confirm; the assignment line is missing from
 * this excerpt.
 */
1554 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1556 	struct ctdb_context *ctdb = rec->ctdb;
1558 	struct ctdb_banning_state *ban_state;
1561 	for (i=0; i<ctdb->num_nodes; i++) {
1562 		if (ctdb->nodes[i]->ban_state == NULL) {
1565 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		/* below the threshold — leave the node alone */
1566 		if (ban_state->count < 2*ctdb->num_nodes) {
1570 		DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1571 			ctdb->nodes[i]->pnn, ban_state->count,
1572 			ctdb->tunable.recovery_ban_period));
1573 		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1574 		ban_state->count = 0;
1576 		/* Banning ourself? */
1577 		if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/*
 * do_recovery: the main recovery sequence, run only on the recovery
 * master.  High-level steps visible in this excerpt:
 *   1. ban misbehaving nodes; take the recovery lock (if configured);
 *   2. ensure all nodes have all databases and matching db priorities;
 *   3. sync the reclock file setting, set RECOVERY_ACTIVE, run the
 *      "startrecovery" event;
 *   4. push our flags, pick a new generation, start a cluster-wide
 *      transaction and recover every database (recover_database);
 *   5. commit, rebuild the vnnmap from lmaster-capable active nodes,
 *      push vnnmap/recmaster/flags, set RECOVERY_NORMAL;
 *   6. reload public IPs, run takeover, run the "recovered" event,
 *      broadcast CTDB_SRVID_RECONFIGURE, forgive ban counts, and
 *      sleep for rerecovery_timeout.
 * NOTE(review): this excerpt is heavily gapped (error returns, closing
 * braces, several declarations are missing); code kept byte-identical,
 * comments only added.
 */
1585   we are the recmaster, and recovery is needed - start a recovery run
1587 static int do_recovery(struct ctdb_recoverd *rec,
1588 		       TALLOC_CTX *mem_ctx, uint32_t pnn,
1589 		       struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1591 	struct ctdb_context *ctdb = rec->ctdb;
1593 	uint32_t generation;
1594 	struct ctdb_dbid_map *dbmap;
1597 	struct timeval start_time;
1598 	uint32_t culprit = (uint32_t)-1;
1601 	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1603 	/* if recovery fails, force it again */
1604 	rec->need_recovery = true;
1606 	ban_misbehaving_nodes(rec, &self_ban);
1608 		DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
	/* take the cluster-wide recovery lock; failure to get it means
	   someone else holds it, so we ban ourselves and back off */
1612 	if (ctdb->tunable.verify_recovery_lock != 0) {
1613 		DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1614 		start_time = timeval_current();
1615 		if (!ctdb_recovery_lock(ctdb, true)) {
1616 			DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1617 					 "and ban ourself for %u seconds\n",
1618 					 ctdb->tunable.recovery_ban_period));
1619 			ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1622 		ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1623 		DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1626 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1628 	/* get a list of all databases */
1629 	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1631 		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1635 	/* we do the db creation before we set the recovery mode, so the freeze happens
1636 	   on all databases we will be dealing with. */
1638 	/* verify that we have all the databases any other node has */
1639 	ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1641 		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1645 	/* verify that all other nodes have all our databases */
1646 	ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1648 		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1651 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1653 	/* update the database priority for all remote databases */
1654 	ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1656 		DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1658 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1661 	/* update all other nodes to use the same setting for reclock files
1662 	   as the local recovery master.
1664 	sync_recovery_lock_file_across_cluster(rec);
1666 	/* set recovery mode to active on all nodes */
1667 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1669 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1673 	/* execute the "startrecovery" event script on all nodes */
1674 	ret = run_startrecovery_eventscript(rec, nodemap);
1676 		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1681 	  update all nodes to have the same flags that we have
1683 	for (i=0;i<nodemap->num;i++) {
1684 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1688 		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1690 			DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1695 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1697 	/* pick a new generation number */
1698 	generation = new_generation();
1700 	/* change the vnnmap on this node to use the new generation
1701 	   number but not on any other nodes.
1702 	   this guarantees that if we abort the recovery prematurely
1703 	   for some reason (a node stops responding?)
1704 	   that we can just return immediately and we will reenter
1705 	   recovery shortly again.
1706 	   I.e. we deliberately leave the cluster with an inconsistent
1707 	   generation id to allow us to abort recovery at any stage and
1708 	   just restart it from scratch.
1710 	vnnmap->generation = generation;
1711 	ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1713 		DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
	/* start a transaction on all nodes, keyed by the new generation;
	   on failure, try to cancel any transactions already started */
1717 	data.dptr = (void *)&generation;
1718 	data.dsize = sizeof(uint32_t);
1720 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1721 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1723 					CONTROL_TIMEOUT(), false, data,
1725 					transaction_start_fail_callback,
1727 		DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1728 		if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1730 					CONTROL_TIMEOUT(), false, tdb_null,
1734 			DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1739 	DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1741 	for (i=0;i<dbmap->num;i++) {
1742 		ret = recover_database(rec, mem_ctx,
1744 				       dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1745 				       pnn, nodemap, generation);
1747 			DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1752 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1754 	/* commit all the changes */
1755 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1757 					CONTROL_TIMEOUT(), false, data,
1760 		DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1764 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1767 	/* update the capabilities for all nodes */
1768 	ret = update_capabilities(ctdb, nodemap);
1770 		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1774 	/* build a new vnn map with all the currently active and
	   lmaster-capable nodes */
1776 	generation = new_generation();
1777 	vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1778 	CTDB_NO_MEMORY(ctdb, vnnmap);
1779 	vnnmap->generation = generation;
1781 	vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1782 	CTDB_NO_MEMORY(ctdb, vnnmap->map);
1783 	for (i=j=0;i<nodemap->num;i++) {
1784 		if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1787 		if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1788 			/* this node can not be an lmaster */
1789 			DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1794 		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1795 		CTDB_NO_MEMORY(ctdb, vnnmap->map);
1796 		vnnmap->map[j++] = nodemap->nodes[i].pnn;
	/* degenerate case: no lmaster-capable nodes at all — put the
	   recmaster (us) in the map so the cluster stays functional */
1799 	if (vnnmap->size == 0) {
1800 		DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1802 		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1803 		CTDB_NO_MEMORY(ctdb, vnnmap->map);
1804 		vnnmap->map[0] = pnn;
1807 	/* update to the new vnnmap on all nodes */
1808 	ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1810 		DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1814 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1816 	/* update recmaster to point to us for all nodes */
1817 	ret = set_recovery_master(ctdb, nodemap, pnn);
1819 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1823 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1826 	  update all nodes to have the same flags that we have
1828 	for (i=0;i<nodemap->num;i++) {
1829 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1833 		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1835 			DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1840 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1842 	/* disable recovery mode */
1843 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1845 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1849 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1851 	/* Fetch known/available public IPs from each active node */
1852 	ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1854 		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1856 		rec->need_takeover_run = true;
1859 	rec->need_takeover_run = false;
1860 	ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, NULL);
1862 		DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1863 		rec->need_takeover_run = true;
1866 	/* execute the "recovered" event script on all nodes */
1867 	ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
1869 		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1873 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1875 	/* send a message to all clients telling them that the cluster
1876 	   has been reconfigured */
1877 	ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1879 	DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1881 	rec->need_recovery = false;
1883 	/* we managed to complete a full recovery, make sure to forgive
1884 	   any past sins by the nodes that could now participate in the
	   cluster */
1887 	DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1888 	for (i=0;i<nodemap->num;i++) {
1889 		struct ctdb_banning_state *ban_state;
1891 		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1895 		ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1896 		if (ban_state == NULL) {
1900 		ban_state->count = 0;
1904 	/* We just finished a recovery successfully.
1905 	   We now wait for rerecovery_timeout before we allow
1906 	   another recovery to take place.
	   NOTE(review): "supressed" typo in the log string below is a
	   runtime string and is deliberately left unchanged here. */
1908 	DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1909 	ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1910 	DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
1917   elections are won by first checking the number of connected nodes, then
1918   the priority time, then the pnn
/* Payload broadcast during a recmaster election; compared field by
   field in ctdb_election_win(). */
1920 struct election_message {
1921 	uint32_t num_connected;      /* how many nodes this candidate can see */
1922 	struct timeval priority_time; /* candidate start time; older wins ties */
1924 	uint32_t node_flags;         /* candidate's own node flags (banned/stopped) */
1928   form this nodes election data
/* Fill *em with the local node's election credentials.  Also refreshes
   rec->node_flags from a fresh nodemap.  A node without the RECMASTER
   capability deliberately makes itself unelectable by zeroing its
   connected count and resetting its priority time to "now". */
1930 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1933 	struct ctdb_node_map *nodemap;
1934 	struct ctdb_context *ctdb = rec->ctdb;
1938 	em->pnn = rec->ctdb->pnn;
1939 	em->priority_time = rec->priority_time;
1941 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1943 		DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1947 	rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1948 	em->node_flags = rec->node_flags;
1950 	for (i=0;i<nodemap->num;i++) {
1951 		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1952 			em->num_connected++;
1956 	/* we shouldnt try to win this election if we cant be a recmaster */
1957 	if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1958 		em->num_connected = 0;
1959 		em->priority_time = timeval_current();
1962 	talloc_free(nodemap);
1966   see if the given election data wins
/* Decide whether the local node beats the remote candidate *em.
   Disqualifiers first (capability, banned, stopped), then compare
   connected-node count, then age (priority_time), then pnn.
   NOTE(review): the return statements are missing from this excerpt. */
1968 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1970 	struct election_message myem;
1973 	ctdb_election_data(rec, &myem);
1975 	/* we cant win if we dont have the recmaster capability */
1976 	if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1980 	/* we cant win if we are banned */
1981 	if (rec->node_flags & NODE_FLAGS_BANNED) {
1985 	/* we cant win if we are stopped */
1986 	if (rec->node_flags & NODE_FLAGS_STOPPED) {
1990 	/* we will automatically win if the other node is banned */
1991 	if (em->node_flags & NODE_FLAGS_BANNED) {
1995 	/* we will automatically win if the other node is banned */
1996 	if (em->node_flags & NODE_FLAGS_STOPPED) {
2000 	/* try to use the most connected node */
2002 	cmp = (int)myem.num_connected - (int)em->num_connected;
2005 	/* then the longest running node */
2007 		cmp = timeval_compare(&em->priority_time, &myem.priority_time);
	/* final tiebreak: lowest/ordered pnn */
2011 		cmp = (int)myem.pnn - (int)em->pnn;
2018   send out an election request
/* Broadcast our election_message to all nodes on CTDB_SRVID_RECOVERY.
   When update_recmaster is true we also optimistically record ourselves
   as recmaster on the local node, assuming we will win. */
2020 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
2023 	TDB_DATA election_data;
2024 	struct election_message emsg;
2026 	struct ctdb_context *ctdb = rec->ctdb;
2028 	srvid = CTDB_SRVID_RECOVERY;
2030 	ctdb_election_data(rec, &emsg);
2032 	election_data.dsize = sizeof(struct election_message);
2033 	election_data.dptr  = (unsigned char *)&emsg;
2036 	/* send an election message to all active nodes */
2037 	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2038 	ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2041 	/* A new node that is already frozen has entered the cluster.
2042 	   The existing nodes are not frozen and dont need to be frozen
2043 	   until the election has ended and we start the actual recovery
2045 	if (update_recmaster == true) {
2046 		/* first we assume we will win the election and set
2047 		   recoverymaster to be ourself on the current node
2049 		ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2051 			DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2061   this function will unban all nodes in the cluster
/* Clears the BANNED flag on every connected, banned node via
   ctdb_ctrl_modflags().  Best-effort: modflags results are not checked. */
2063 static void unban_all_nodes(struct ctdb_context *ctdb)
2066 	struct ctdb_node_map *nodemap;
2067 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2069 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2071 		DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2075 	for (i=0;i<nodemap->num;i++) {
2076 		if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2077 		  && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2078 			ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
2082 	talloc_free(tmp_ctx);
2087   we think we are winning the election - send a broadcast election request
/* Timed-event callback: re-broadcast our election request (without
   touching the recmaster setting) and drop the one-shot timer. */
2089 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2091 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2094 	ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
2096 		DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2099 	talloc_free(rec->send_election_te);
2100 	rec->send_election_te = NULL;
2104   handler for memory dumps
/* srvid message handler: produce a talloc memory-usage dump and send it
   back to the requester identified by the rd_memdump_reply in data. */
2106 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2107 			     TDB_DATA data, void *private_data)
2109 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2112 	struct rd_memdump_reply *rd;
	/* validate the payload is exactly a reply-address structure */
2114 	if (data.dsize != sizeof(struct rd_memdump_reply)) {
2115 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2116 		talloc_free(tmp_ctx);
2119 	rd = (struct rd_memdump_reply *)data.dptr;
2121 	dump = talloc_zero(tmp_ctx, TDB_DATA);
2123 		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2124 		talloc_free(tmp_ctx);
2127 	ret = ctdb_dump_memory(ctdb, dump);
2129 		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2130 		talloc_free(tmp_ctx);
2134 	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2136 	ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2138 		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2139 		talloc_free(tmp_ctx);
2143 	talloc_free(tmp_ctx);
/*
 * getlog_handler: srvid message handler that forks a child to collect
 * the in-memory log ringbuffer and ship it to the address given in the
 * ctdb_get_log_addr payload.  The child switches itself into client
 * mode before collecting.
 * NOTE(review): the parent-path code after the fork is missing from
 * this excerpt.
 */
2149 static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2150 			   TDB_DATA data, void *private_data)
2152 	struct ctdb_get_log_addr *log_addr;
2155 	if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
2156 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2159 	log_addr = (struct ctdb_get_log_addr *)data.dptr;
	/* fork without freeing the ringbuffer so the child can read it */
2161 	child = ctdb_fork_no_free_ringbuffer(ctdb);
2162 	if (child == (pid_t)-1) {
2163 		DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
	/* child: become a plain client, then collect and send the log */
2168 		ctdb_set_process_name("ctdb_rec_log_collector");
2169 		if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
2170 			DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
2173 		ctdb_collect_log(ctdb, log_addr);
2179   handler for clearlog
/* srvid message handler: wipe the in-memory log ringbuffer. */
2181 static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2182 			     TDB_DATA data, void *private_data)
2184 	ctdb_clear_log(ctdb);
2188   handler for reload_nodes
/* srvid message handler: re-read the nodes file on request. */
2190 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2191 			     TDB_DATA data, void *private_data)
2193 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2195 	DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2197 	reload_nodes_file(rec->ctdb);
/* Timed-event callback: re-enable the periodic public-IP consistency
   check by destroying the disable context (freeing it also cancels any
   child events hanging off it). */
2201 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
2202 		      struct timeval yt, void *p)
2204 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2206 	talloc_free(rec->ip_check_disable_ctx);
2207 	rec->ip_check_disable_ctx = NULL;
/* Timed-event callback: perform the deferred rebalance by running a
   takeover; on failure, flag a retry via rec->need_takeover_run.  The
   deferred-rebalance context is torn down either way. */
2211 static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event *te,
2212 				   struct timeval t, void *p)
2214 	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2215 	struct ctdb_context *ctdb = rec->ctdb;
2218 	DEBUG(DEBUG_NOTICE,("Rebalance all nodes that have had ip assignment changes.\n"));
2220 	ret = ctdb_takeover_run(ctdb, rec->nodemap, takeover_fail_callback, NULL);
2222 		DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
2223 		rec->need_takeover_run = true;
2226 	talloc_free(rec->deferred_rebalance_ctx);
2227 	rec->deferred_rebalance_ctx = NULL;
/*
 * recd_node_rebalance_handler: srvid message handler for a node-added
 * rebalance request.  Payload is a single uint32_t pnn.  Forces an
 * lcp2 rebalance for that node, then (re)arms a deferred timer so the
 * actual takeover runs deferred_rebalance_on_node_add seconds later;
 * a pre-existing timer is cancelled first.
 */
2231 static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvid,
2232 			     TDB_DATA data, void *private_data)
2235 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2237 	if (data.dsize != sizeof(uint32_t)) {
2238 		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
	/* feature disabled — tunable set to 0 */
2242 	if (ctdb->tunable.deferred_rebalance_on_node_add == 0) {
2246 	pnn = *(uint32_t *)&data.dptr[0];
2248 	lcp2_forcerebalance(ctdb, pnn);
2249 	DEBUG(DEBUG_NOTICE,("Received message to perform node rebalancing for node %d\n", pnn));
	/* restart the deferral window if one is already pending */
2251 	if (rec->deferred_rebalance_ctx != NULL) {
2252 		talloc_free(rec->deferred_rebalance_ctx);
2254 	rec->deferred_rebalance_ctx = talloc_new(rec);
2255 	event_add_timed(ctdb->ev, rec->deferred_rebalance_ctx,
2256 			timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
2257 			ctdb_rebalance_timeout, rec);
/* srvid message handler: record a manually-moved public IP in the
   assignment tree.  Ignored unless this node is the recmaster; payload
   must be exactly one ctdb_public_ip. */
2262 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2263 			     TDB_DATA data, void *private_data)
2265 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2266 	struct ctdb_public_ip *ip;
2268 	if (rec->recmaster != rec->ctdb->pnn) {
2269 		DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2273 	if (data.dsize != sizeof(struct ctdb_public_ip)) {
2274 		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2278 	ip = (struct ctdb_public_ip *)data.dptr;
2280 	update_ip_assignment_tree(rec->ctdb, ip);
/*
 * disable_ip_check_handler: srvid message handler to temporarily
 * suspend the public-IP consistency check.  Payload is a uint32_t
 * timeout in seconds; 0 presumably re-enables immediately — TODO
 * confirm, the branch lines are partially missing here.  Any existing
 * disable window is cancelled before a new one is armed.
 * NOTE(review): "expexting" / "recaived" typos below are runtime log
 * strings, deliberately left unchanged in this comments-only edit.
 */
2284 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2285 			     TDB_DATA data, void *private_data)
2287 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2290 	if (rec->ip_check_disable_ctx != NULL) {
2291 		talloc_free(rec->ip_check_disable_ctx);
2292 		rec->ip_check_disable_ctx = NULL;
2295 	if (data.dsize != sizeof(uint32_t)) {
2296 		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2297 				 "expexting %lu\n", (long unsigned)data.dsize,
2298 				 (long unsigned)sizeof(uint32_t)));
2301 	if (data.dptr == NULL) {
2302 		DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
2306 	timeout = *((uint32_t *)data.dptr);
2309 		DEBUG(DEBUG_NOTICE,("Reenabling ip check\n"));
2313 	DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
	/* the context's lifetime bounds the disable window; the timed
	   event below re-enables the check when it fires */
2315 	rec->ip_check_disable_ctx = talloc_new(rec);
2316 	CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
2318 	event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
2323   handler for reload all ips.
/* srvid message handler: stash the reload-all-ips request (reply
   address) in the file-scope reload_all_ips_request pointer; the
   request is serviced later from the monitor loop. */
2325 static void ip_reloadall_handler(struct ctdb_context *ctdb, uint64_t srvid,
2326 		TDB_DATA data, void *private_data)
2328 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2330 	if (data.dsize != sizeof(struct reloadips_all_reply)) {
2331 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
	/* take ownership of the payload so it survives this handler */
2335 	reload_all_ips_request = (struct reloadips_all_reply *)talloc_steal(rec, data.dptr);
2337 	DEBUG(DEBUG_NOTICE,("RELOAD_ALL_IPS message received from node:%d srvid:%d\n", reload_all_ips_request->pnn, (int)reload_all_ips_request->srvid));
/* Per-node failure callback for the async RELOAD_PUBLIC_IPS control;
   records the failure in the shared *status accumulator. */
2341 static void async_reloadips_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2343 	uint32_t *status = callback_data;
2346 		DEBUG(DEBUG_ERR,("Reload ips all failed on node %d\n", node_pnn));
/*
 * reload_all_ips: service a queued RELOAD_ALL_IPS request.  Refuses to
 * run unless every node is up and healthy (flags == 0), then sends the
 * RELOAD_PUBLIC_IPS control to all connected nodes and finally notifies
 * the original requester (rips->pnn / rips->srvid) with an empty reply.
 * NOTE(review): return type and several lines are missing from this
 * excerpt; code kept byte-identical.
 */
2352 reload_all_ips(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, struct reloadips_all_reply *rips)
2354 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2359 	DEBUG(DEBUG_ERR,("RELOAD ALL IPS on all active nodes\n"));
2360 	for (i = 0; i< nodemap->num; i++) {
2361 		if (nodemap->nodes[i].flags != 0) {
2362 			DEBUG(DEBUG_ERR, ("Can not reload ips on all nodes. Node %d is not up and healthy\n", i));
2363 			talloc_free(tmp_ctx);
2368 	/* send the flags update to all connected nodes */
2369 	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2371 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELOAD_PUBLIC_IPS,
2375 					async_reloadips_callback, NULL,
2377 		DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2378 		talloc_free(tmp_ctx);
2383 		DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2384 		talloc_free(tmp_ctx);
	/* tell the requester we are done (empty payload = ack) */
2388 	ctdb_client_send_message(ctdb, rips->pnn, rips->srvid, tdb_null);
2390 	talloc_free(tmp_ctx);
2396   handler for ip reallocate, just add it to the list of callers and
2397   handle this later in the monitor_cluster loop so we do not recurse
2398   with other callers to takeover_run()
/* Queues the caller's reply address (an rd_memdump_reply) onto
   rec->reallocate_callers; process_ipreallocate_requests() drains the
   list later. */
2400 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2401 				  TDB_DATA data, void *private_data)
2403 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2404 	struct ip_reallocate_list *caller;
2406 	if (data.dsize != sizeof(struct rd_memdump_reply)) {
2407 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
	/* lazily create the context that owns all queued callers */
2411 	if (rec->ip_reallocate_ctx == NULL) {
2412 		rec->ip_reallocate_ctx = talloc_new(rec);
2413 		CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2416 	caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2417 	CTDB_NO_MEMORY_FATAL(ctdb, caller);
	/* take ownership of the reply address and push onto the list */
2419 	caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2420 	caller->next = rec->reallocate_callers;
2421 	rec->reallocate_callers = caller;
/*
 * process_ipreallocate_requests: drain the queued "ctdb ipreallocate"
 * callers — refresh remote public-IP info, run a takeover, then send
 * each caller the int32 result (callers with srvid==0 get no reply).
 * Finally the whole caller list is freed in one talloc_free.
 */
2426 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2430 	struct ip_reallocate_list *callers;
2433 	DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2435 	/* update the list of public ips that a node can handle for
2438 	ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2440 		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2442 		rec->need_takeover_run = true;
2445 	ret = ctdb_takeover_run(ctdb, rec->nodemap, takeover_fail_callback, NULL);
2447 		DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
2448 		rec->need_takeover_run = true;
	/* reply payload is the takeover result code */
2452 	result.dsize = sizeof(int32_t);
2453 	result.dptr  = (uint8_t *)&ret;
2455 	for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2457 		/* Someone that sent srvid==0 does not want a reply */
2458 		if (callers->rd->srvid == 0) {
2461 		DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2462 				  "%u:%llu\n", (unsigned)callers->rd->pnn,
2463 				  (unsigned long long)callers->rd->srvid));
2464 		ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2466 			DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2467 					 "message to %u:%llu\n",
2468 					 (unsigned)callers->rd->pnn,
2469 					 (unsigned long long)callers->rd->srvid));
	/* freeing the context frees every queued caller at once */
2473 	talloc_free(rec->ip_reallocate_ctx);
2474 	rec->ip_reallocate_ctx = NULL;
2475 	rec->reallocate_callers = NULL;
2480   handler for recovery master elections
/* srvid message handler for an incoming election packet.  Resets the
   election timeout, then either (a) we would win: schedule a delayed
   re-broadcast of our own candidacy, or (b) we concede: cancel any
   pending broadcast, release the reclock fd if another node won, and
   record the sender as recmaster on the local node. */
2482 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2483 			     TDB_DATA data, void *private_data)
2485 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2487 	struct election_message *em = (struct election_message *)data.dptr;
2488 	TALLOC_CTX *mem_ctx;
2490 	/* we got an election packet - update the timeout for the election */
2491 	talloc_free(rec->election_timeout);
2492 	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2494 						timeval_current_ofs(0, 500000) :
2495 						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2496 						ctdb_election_timeout, rec);
2498 	mem_ctx = talloc_new(ctdb);
2500 	/* someone called an election. check their election data
2501 	   and if we disagree and we would rather be the elected node,
2502 	   send a new election message to all other nodes
2504 	if (ctdb_election_win(rec, em)) {
		/* delay our counter-broadcast slightly to batch up
		   competing election packets */
2505 		if (!rec->send_election_te) {
2506 			rec->send_election_te = event_add_timed(ctdb->ev, rec,
2507 								timeval_current_ofs(0, 500000),
2508 								election_send_request, rec);
2510 		talloc_free(mem_ctx);
2511 		/*unban_all_nodes(ctdb);*/
	/* we lose: stop campaigning */
2516 	talloc_free(rec->send_election_te);
2517 	rec->send_election_te = NULL;
2519 	if (ctdb->tunable.verify_recovery_lock != 0) {
2520 		/* release the recmaster lock */
2521 		if (em->pnn != ctdb->pnn &&
2522 		    ctdb->recovery_lock_fd != -1) {
2523 			close(ctdb->recovery_lock_fd);
2524 			ctdb->recovery_lock_fd = -1;
2525 			unban_all_nodes(ctdb);
2529 	/* ok, let that guy become recmaster then */
2530 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2532 		DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2533 		talloc_free(mem_ctx);
2537 	talloc_free(mem_ctx);
2543 force the start of the election process
/*
 * Kick off a new recovery-master election.
 *
 * The cluster is first put into recovery mode to quiesce internode
 * traffic, the election timeout is (re)armed, our election request is
 * broadcast, and we then block in ctdb_wait_election() to collect the
 * responses.
 */
2545 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2546 struct ctdb_node_map *nodemap)
2549 struct ctdb_context *ctdb = rec->ctdb;
2551 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2553 /* set all nodes to recovery mode to stop all internode traffic */
2554 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2556 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2560 talloc_free(rec->election_timeout);
2561 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2563 timeval_current_ofs(0, 500000) :
2564 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2565 ctdb_election_timeout, rec);
2567 ret = send_election_request(rec, pnn, true);
2569 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2573 /* wait for a few seconds to collect all responses */
2574 ctdb_wait_election(rec);
2580 handler for when a node changes its flags
/*
 * Handle a node flag-change notification.
 *
 * Validates the message size, refreshes the local nodemap and caches the
 * node's new flags.  If we are the recovery master and the cluster is in
 * normal recovery mode, a takeover run is flagged when the
 * NODE_FLAGS_DISABLED bits changed; disconnect/ban transitions are
 * handled elsewhere (see the inline comment below).
 */
2582 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2583 TDB_DATA data, void *private_data)
2586 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2587 struct ctdb_node_map *nodemap=NULL;
2588 TALLOC_CTX *tmp_ctx;
2590 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2591 int disabled_flag_changed;
2593 if (data.dsize != sizeof(*c)) {
2594 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2598 tmp_ctx = talloc_new(ctdb);
2599 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2601 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2603 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2604 talloc_free(tmp_ctx);
/* locate the affected node in the local nodemap */
2609 for (i=0;i<nodemap->num;i++) {
2610 if (nodemap->nodes[i].pnn == c->pnn) break;
2613 if (i == nodemap->num) {
2614 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2615 talloc_free(tmp_ctx);
2619 if (c->old_flags != c->new_flags) {
2620 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* remember whether the DISABLED bits differ from our cached flags */
2623 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2625 nodemap->nodes[i].flags = c->new_flags;
2627 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2628 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2631 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2632 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2636 ctdb->recovery_master == ctdb->pnn &&
2637 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2638 /* Only do the takeover run if the perm disabled or unhealthy
2639 flags changed since these will cause an ip failover but not
2641 If the node became disconnected or banned this will also
2642 lead to an ip address failover but that is handled
2645 if (disabled_flag_changed) {
2646 rec->need_takeover_run = true;
2650 talloc_free(tmp_ctx);
2654 handler for when we need to push out flag changes to all other nodes
/*
 * Push the authoritative node flags out to the cluster.
 *
 * Looks up the current recovery master, fetches the nodemap from it (the
 * authoritative copy of the flags), and broadcasts a
 * CTDB_CONTROL_MODIFY_FLAGS control to every connected node.
 */
2656 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2657 TDB_DATA data, void *private_data)
2660 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2661 struct ctdb_node_map *nodemap=NULL;
2662 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2666 /* find the recovery master */
2667 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2669 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2670 talloc_free(tmp_ctx);
2674 /* read the node flags from the recmaster */
2675 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2677 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2678 talloc_free(tmp_ctx);
2681 if (c->pnn >= nodemap->num) {
2682 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2683 talloc_free(tmp_ctx);
2687 /* send the flags update to all connected nodes */
2688 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2690 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2691 nodes, 0, CONTROL_TIMEOUT(),
2695 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2697 talloc_free(tmp_ctx);
2701 talloc_free(tmp_ctx);
/* aggregate state shared between verify_recmode() and its async callback */
2705 struct verify_recmode_normal_data {
2707 enum monitor_result status;
/*
 * Async callback for one getrecmode reply.
 *
 * Marks the shared status MONITOR_FAILED when the control itself failed,
 * or MONITOR_RECOVERY_NEEDED when the node reports a recovery mode other
 * than CTDB_RECOVERY_NORMAL.
 */
2710 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2712 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2715 /* one more node has responded with recmode data*/
2718 /* if we failed to get the recmode, then return an error and let
2719 the main loop try again.
2721 if (state->state != CTDB_CONTROL_DONE) {
2722 if (rmdata->status == MONITOR_OK) {
2723 rmdata->status = MONITOR_FAILED;
2728 /* if we got a response, then the recmode will be stored in the
2731 if (state->status != CTDB_RECOVERY_NORMAL) {
2732 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2733 rmdata->status = MONITOR_RECOVERY_NEEDED;
2740 /* verify that all nodes are in normal recovery mode */
/*
 * Verify that every active node is in normal recovery mode.
 *
 * Sends an async getrecmode control to each active node, then pumps the
 * event loop until all replies have arrived (or the controls time out).
 * Returns the aggregated monitor_result computed by the callbacks.
 */
2741 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2743 struct verify_recmode_normal_data *rmdata;
2744 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2745 struct ctdb_client_control_state *state;
2746 enum monitor_result status;
2749 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2750 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2752 rmdata->status = MONITOR_OK;
2754 /* loop over all active nodes and send an async getrecmode call to
2756 for (j=0; j<nodemap->num; j++) {
2757 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2760 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2762 nodemap->nodes[j].pnn);
2763 if (state == NULL) {
2764 /* we failed to send the control, treat this as
2765 an error and try again next iteration
2767 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2768 talloc_free(mem_ctx);
2769 return MONITOR_FAILED;
2772 /* set up the callback functions */
2773 state->async.fn = verify_recmode_normal_callback;
2774 state->async.private_data = rmdata;
2776 /* one more control to wait for to complete */
2781 /* now wait for up to the maximum number of seconds allowed
2782 or until all nodes we expect a response from has replied
2784 while (rmdata->count > 0) {
2785 event_loop_once(ctdb->ev);
2788 status = rmdata->status;
2789 talloc_free(mem_ctx);
/* aggregate state shared between verify_recmaster() and its async callback */
2794 struct verify_recmaster_data {
2795 struct ctdb_recoverd *rec;
2798 enum monitor_result status;
/*
 * Async callback for one getrecmaster reply.
 *
 * Marks the shared status MONITOR_FAILED when the control itself failed.
 * If the node reports a recmaster other than our own pnn, the node is
 * recorded as a culprit and MONITOR_ELECTION_NEEDED is flagged.
 */
2801 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2803 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2806 /* one more node has responded with recmaster data*/
2809 /* if we failed to get the recmaster, then return an error and let
2810 the main loop try again.
2812 if (state->state != CTDB_CONTROL_DONE) {
2813 if (rmdata->status == MONITOR_OK) {
2814 rmdata->status = MONITOR_FAILED;
2819 /* if we got a response, then the recmaster will be stored in the
2822 if (state->status != rmdata->pnn) {
2823 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2824 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2825 rmdata->status = MONITOR_ELECTION_NEEDED;
2832 /* verify that all nodes agree that we are the recmaster */
/*
 * Verify that all active nodes agree that we (pnn) are the recmaster.
 *
 * Sends an async getrecmaster control to each active node, then pumps
 * the event loop until every reply has arrived (or the controls time
 * out).  Returns the aggregated monitor_result computed by the
 * callbacks.
 */
2833 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2835 struct ctdb_context *ctdb = rec->ctdb;
2836 struct verify_recmaster_data *rmdata;
2837 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2838 struct ctdb_client_control_state *state;
2839 enum monitor_result status;
2842 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2843 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2847 rmdata->status = MONITOR_OK;
2849 /* loop over all active nodes and send an async getrecmaster call to
2851 for (j=0; j<nodemap->num; j++) {
2852 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2855 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2857 nodemap->nodes[j].pnn);
2858 if (state == NULL) {
2859 /* we failed to send the control, treat this as
2860 an error and try again next iteration
2862 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2863 talloc_free(mem_ctx);
2864 return MONITOR_FAILED;
2867 /* set up the callback functions */
2868 state->async.fn = verify_recmaster_callback;
2869 state->async.private_data = rmdata;
2871 /* one more control to wait for to complete */
2876 /* now wait for up to the maximum number of seconds allowed
2877 or until all nodes we expect a response from has replied
2879 while (rmdata->count > 0) {
2880 event_loop_once(ctdb->ev);
2883 status = rmdata->status;
2884 talloc_free(mem_ctx);
/*
 * Detect changes to the local node's public network interfaces.
 *
 * Fetches the current interface list and compares it against the copy
 * cached in rec->ifaces: a change in interface count, interface name, or
 * link state counts as "changed".  The cache is replaced with the fresh
 * list before returning.  A failed fetch is conservatively treated as a
 * change (see the inline comment below).
 */
2888 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2889 struct ctdb_recoverd *rec)
2891 struct ctdb_control_get_ifaces *ifaces = NULL;
2892 TALLOC_CTX *mem_ctx;
2895 mem_ctx = talloc_new(NULL);
2897 /* Read the interfaces from the local node */
2898 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2899 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2900 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2901 /* We could return an error. However, this will be
2902 * rare so we'll decide that the interfaces have
2903 * actually changed, just in case.
2905 talloc_free(mem_ctx);
2910 /* We haven't been here before so things have changed */
2911 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2913 } else if (rec->ifaces->num != ifaces->num) {
2914 /* Number of interfaces has changed */
2915 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2916 rec->ifaces->num, ifaces->num));
2919 /* See if interface names or link states have changed */
2921 for (i = 0; i < rec->ifaces->num; i++) {
2922 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
2923 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2925 ("Interface in slot %d changed: %s => %s\n",
2926 i, iface->name, ifaces->ifaces[i].name));
2930 if (iface->link_state != ifaces->ifaces[i].link_state) {
2932 ("Interface %s changed state: %d => %d\n",
2933 iface->name, iface->link_state,
2934 ifaces->ifaces[i].link_state));
/* replace the cached list with the freshly fetched one */
2941 talloc_free(rec->ifaces);
2942 rec->ifaces = talloc_steal(rec, ifaces);
2944 talloc_free(mem_ctx);
2948 /* called to check that the local allocation of public ip addresses is ok.
/*
 * Verify that the local allocation of public IP addresses is sane.
 *
 * Samples uptime before and after the checks; if a recovery started or
 * finished in between (or is still in progress) the whole check is
 * skipped, since the IP layout is in flux.  Otherwise it confirms that
 * every IP assigned to us is actually on an interface, releases IPs we
 * are serving but should not be, and flags unassigned IPs we could host.
 * Any inconsistency triggers a CTDB_SRVID_TAKEOVER_RUN message to the
 * recovery master.
 */
2950 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
2952 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2953 struct ctdb_uptime *uptime1 = NULL;
2954 struct ctdb_uptime *uptime2 = NULL;
2956 bool need_takeover_run = false;
2958 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2959 CTDB_CURRENT_NODE, &uptime1);
2961 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2962 talloc_free(mem_ctx);
2966 if (interfaces_have_changed(ctdb, rec)) {
2967 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2968 "local node %u - force takeover run\n",
2970 need_takeover_run = true;
2973 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2974 CTDB_CURRENT_NODE, &uptime2);
2976 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2977 talloc_free(mem_ctx);
2981 /* skip the check if the startrecovery time has changed */
2982 if (timeval_compare(&uptime1->last_recovery_started,
2983 &uptime2->last_recovery_started) != 0) {
2984 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2985 talloc_free(mem_ctx);
2989 /* skip the check if the endrecovery time has changed */
2990 if (timeval_compare(&uptime1->last_recovery_finished,
2991 &uptime2->last_recovery_finished) != 0) {
2992 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2993 talloc_free(mem_ctx);
2997 /* skip the check if we have started but not finished recovery */
2998 if (timeval_compare(&uptime1->last_recovery_finished,
2999 &uptime1->last_recovery_started) != 1) {
3000 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3001 talloc_free(mem_ctx);
3006 /* verify that we have the ip addresses we should have
3007 and we don't have ones we shouldn't have.
3008 if we find an inconsistency we set recmode to
3009 active on the local node and wait for the recmaster
3010 to do a full blown recovery.
3011 also if the pnn is -1 and we are healthy and can host the ip
3012 we also request a ip reallocation.
3014 if (ctdb->tunable.disable_ip_failover == 0) {
3015 struct ctdb_all_public_ips *ips = NULL;
3017 /* read the *available* IPs from the local node */
3018 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3020 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3021 talloc_free(mem_ctx);
3025 for (j=0; j<ips->num; j++) {
3026 if (ips->ips[j].pnn == -1 &&
3027 nodemap->nodes[pnn].flags == 0) {
3028 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3029 ctdb_addr_to_str(&ips->ips[j].addr)));
3030 need_takeover_run = true;
3036 /* read the *known* IPs from the local node */
3037 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3039 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3040 talloc_free(mem_ctx);
3044 for (j=0; j<ips->num; j++) {
3045 if (ips->ips[j].pnn == pnn) {
3046 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3047 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3048 ctdb_addr_to_str(&ips->ips[j].addr)));
3049 need_takeover_run = true;
3052 if (ctdb->do_checkpublicip &&
3053 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3055 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3056 ctdb_addr_to_str(&ips->ips[j].addr)));
3058 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3059 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
/* ask the recovery master to perform a takeover run for us */
3066 if (need_takeover_run) {
3067 struct takeover_run_reply rd;
3070 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3074 data.dptr = (uint8_t *)&rd;
3075 data.dsize = sizeof(rd);
3077 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3079 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3082 talloc_free(mem_ctx);
/*
 * Async callback for CTDB_CONTROL_GET_NODEMAP: store the nodemap
 * returned by a remote node into the caller's array, indexed by pnn.
 * Replies claiming an out-of-range pnn are rejected.
 */
3087 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3089 struct ctdb_node_map **remote_nodemaps = callback_data;
3091 if (node_pnn >= ctdb->num_nodes) {
3092 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3096 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Collect the nodemap from every active node into remote_nodemaps[]
 * (indexed by pnn) via an async broadcast of CTDB_CONTROL_GET_NODEMAP.
 */
3100 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3101 struct ctdb_node_map *nodemap,
3102 struct ctdb_node_map **remote_nodemaps)
3106 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3107 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3109 CONTROL_TIMEOUT(), false, tdb_null,
3110 async_getnodemap_callback,
3112 remote_nodemaps) != 0) {
3113 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/* state tracking the forked child that verifies the recovery lock file */
3121 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
3122 struct ctdb_check_reclock_state {
3123 struct ctdb_context *ctdb;
3124 struct timeval start_time;
3127 struct timed_event *te;
3128 struct fd_event *fde;
3129 enum reclock_child_status status;
3132 /* when we free the reclock state we must kill any child process.
/*
 * Talloc destructor for the reclock-check state: report how long the
 * lock check took, close both ends of the pipe, and kill the child.
 */
3134 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
3136 struct ctdb_context *ctdb = state->ctdb;
3138 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
3140 if (state->fd[0] != -1) {
3141 close(state->fd[0]);
3144 if (state->fd[1] != -1) {
3145 close(state->fd[1]);
3148 ctdb_kill(ctdb, state->child, SIGKILL);
3153 called if our check_reclock child times out. this would happen if
3154 i/o to the reclock file blocks.
/*
 * Timer callback: the reclock-check child did not respond in time
 * (e.g. blocked on cluster-filesystem i/o); record a timeout result.
 */
3156 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3157 struct timeval t, void *private_data)
3159 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3160 struct ctdb_check_reclock_state);
3162 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3163 state->status = RECLOCK_TIMEOUT;
3166 /* this is called when the child process has completed checking the reclock
3167 file and has written data back to us through the pipe.
/*
 * Pipe-readable callback: the reclock-check child has written its one
 * status byte.  Cancels the timeout timer and records RECLOCK_OK or
 * RECLOCK_FAILED depending on what (if anything) was read.
 */
3169 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
3170 uint16_t flags, void *private_data)
3172 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
3173 struct ctdb_check_reclock_state);
3177 /* we got a response from our child process so we can abort the
3180 talloc_free(state->te);
3183 ret = read(state->fd[0], &c, 1);
3184 if (ret != 1 || c != RECLOCK_OK) {
3185 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3186 state->status = RECLOCK_FAILED;
3191 state->status = RECLOCK_OK;
/*
 * Verify that the recovery lock we hold is still usable.
 *
 * A child process is forked to pread() from the lock fd — so that a hung
 * cluster filesystem cannot block the recovery daemon itself — and it
 * reports a single status byte back over a pipe.  A 15 second timer
 * guards against the child hanging.  On RECLOCK_FAILED the lock fd is
 * closed.
 */
3195 static int check_recovery_lock(struct ctdb_context *ctdb)
3198 struct ctdb_check_reclock_state *state;
3199 pid_t parent = getpid();
3201 if (ctdb->recovery_lock_fd == -1) {
3202 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
3206 state = talloc(ctdb, struct ctdb_check_reclock_state);
3207 CTDB_NO_MEMORY(ctdb, state);
3210 state->start_time = timeval_current();
3211 state->status = RECLOCK_CHECKING;
3215 ret = pipe(state->fd);
3218 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
3222 state->child = ctdb_fork(ctdb);
3223 if (state->child == (pid_t)-1) {
3224 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
3225 close(state->fd[0]);
3227 close(state->fd[1]);
/* child: attempt the read and report the result to the parent */
3233 if (state->child == 0) {
3234 char cc = RECLOCK_OK;
3235 close(state->fd[0]);
3238 ctdb_set_process_name("ctdb_rec_reclock");
3239 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
3240 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
3241 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
3242 cc = RECLOCK_FAILED;
/* NOTE(review): write() return value is not checked here */
3245 write(state->fd[1], &cc, 1);
3246 /* make sure we die when our parent dies */
3247 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
/* parent: close the write end and wait for the child's report */
3252 close(state->fd[1]);
3254 set_close_on_exec(state->fd[0]);
3256 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
3258 talloc_set_destructor(state, check_reclock_destructor);
3260 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
3261 ctdb_check_reclock_timeout, state);
3262 if (state->te == NULL) {
3263 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
3268 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
3270 reclock_child_handler,
3273 if (state->fde == NULL) {
3274 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3278 tevent_fd_set_auto_close(state->fde);
/* block until the child reports or the timer fires */
3280 while (state->status == RECLOCK_CHECKING) {
3281 event_loop_once(ctdb->ev);
3284 if (state->status == RECLOCK_FAILED) {
3285 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
3286 close(ctdb->recovery_lock_fd);
3287 ctdb->recovery_lock_fd = -1;
/*
 * Sync our cached recovery lock file name with the main daemon's
 * configuration.  Handles three cases: the reclock was disabled, it is
 * being set for the first time, or its path changed.  In each case any
 * open lock fd is closed.
 */
3296 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3298 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3299 const char *reclockfile;
3301 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3302 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3303 talloc_free(tmp_ctx);
/* case 1: reclock has been disabled */
3307 if (reclockfile == NULL) {
3308 if (ctdb->recovery_lock_file != NULL) {
3309 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3310 talloc_free(ctdb->recovery_lock_file);
3311 ctdb->recovery_lock_file = NULL;
3312 if (ctdb->recovery_lock_fd != -1) {
3313 close(ctdb->recovery_lock_fd);
3314 ctdb->recovery_lock_fd = -1;
3317 ctdb->tunable.verify_recovery_lock = 0;
3318 talloc_free(tmp_ctx);
/* case 2: first time we see a reclock file name */
3322 if (ctdb->recovery_lock_file == NULL) {
3323 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3324 if (ctdb->recovery_lock_fd != -1) {
3325 close(ctdb->recovery_lock_fd);
3326 ctdb->recovery_lock_fd = -1;
3328 talloc_free(tmp_ctx);
/* unchanged: nothing to do */
3333 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3334 talloc_free(tmp_ctx);
/* case 3: the reclock file path has changed */
3338 talloc_free(ctdb->recovery_lock_file);
3339 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3340 ctdb->tunable.verify_recovery_lock = 0;
3341 if (ctdb->recovery_lock_fd != -1) {
3342 close(ctdb->recovery_lock_fd);
3343 ctdb->recovery_lock_fd = -1;
3346 talloc_free(tmp_ctx);
3350 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3351 TALLOC_CTX *mem_ctx)
3354 struct ctdb_node_map *nodemap=NULL;
3355 struct ctdb_node_map *recmaster_nodemap=NULL;
3356 struct ctdb_node_map **remote_nodemaps=NULL;
3357 struct ctdb_vnn_map *vnnmap=NULL;
3358 struct ctdb_vnn_map *remote_vnnmap=NULL;
3359 int32_t debug_level;
3364 /* verify that the main daemon is still running */
3365 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3366 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3370 /* ping the local daemon to tell it we are alive */
3371 ctdb_ctrl_recd_ping(ctdb);
3373 if (rec->election_timeout) {
3374 /* an election is in progress */
3378 /* read the debug level from the parent and update locally */
3379 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3381 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3384 LogLevel = debug_level;
3386 /* get relevant tunables */
3387 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3389 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3393 /* get the current recovery lock file from the server */
3394 if (update_recovery_lock_file(ctdb) != 0) {
3395 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3399 /* Make sure that if recovery lock verification becomes disabled when
3402 if (ctdb->tunable.verify_recovery_lock == 0) {
3403 if (ctdb->recovery_lock_fd != -1) {
3404 close(ctdb->recovery_lock_fd);
3405 ctdb->recovery_lock_fd = -1;
3409 pnn = ctdb_get_pnn(ctdb);
3411 /* get the vnnmap */
3412 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3414 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3419 /* get number of nodes */
3421 talloc_free(rec->nodemap);
3422 rec->nodemap = NULL;
3425 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3427 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3430 nodemap = rec->nodemap;
3432 /* remember our own node flags */
3433 rec->node_flags = nodemap->nodes[pnn].flags;
3435 ban_misbehaving_nodes(rec, &self_ban);
3437 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3441 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3442 also frozen and that the recmode is set to active.
3444 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3445 /* If this node has become inactive then we want to
3446 * reduce the chances of it taking over the recovery
3447 * master role when it becomes active again. This
3448 * helps to stabilise the recovery master role so that
3449 * it stays on the most stable node.
3451 rec->priority_time = timeval_current();
3453 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3455 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3457 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3458 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3460 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3462 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3465 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3467 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3473 /* If this node is stopped or banned then it is not the recovery
3474 * master, so don't do anything. This prevents stopped or banned
3475 * node from starting election and sending unnecessary controls.
3480 /* check which node is the recovery master */
3481 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3483 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3487 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
3488 if (rec->recmaster != pnn) {
3489 if (rec->ip_reallocate_ctx != NULL) {
3490 talloc_free(rec->ip_reallocate_ctx);
3491 rec->ip_reallocate_ctx = NULL;
3492 rec->reallocate_callers = NULL;
3496 /* This is a special case. When recovery daemon is started, recmaster
3497 * is set to -1. If a node is not started in stopped state, then
3498 * start election to decide recovery master
3500 if (rec->recmaster == (uint32_t)-1) {
3501 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3502 force_election(rec, pnn, nodemap);
3506 /* update the capabilities for all nodes */
3507 ret = update_capabilities(ctdb, nodemap);
3509 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3514 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3515 * but we have, then force an election and try to become the new
3518 if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3519 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3520 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3521 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3522 " but we (node %u) have - force an election\n",
3523 rec->recmaster, pnn));
3524 force_election(rec, pnn, nodemap);
3528 /* count how many active nodes there are */
3529 rec->num_active = 0;
3530 rec->num_connected = 0;
3531 for (i=0; i<nodemap->num; i++) {
3532 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3535 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3536 rec->num_connected++;
3541 /* verify that the recmaster node is still active */
3542 for (j=0; j<nodemap->num; j++) {
3543 if (nodemap->nodes[j].pnn==rec->recmaster) {
3548 if (j == nodemap->num) {
3549 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3550 force_election(rec, pnn, nodemap);
3554 /* if recovery master is disconnected we must elect a new recmaster */
3555 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3556 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3557 force_election(rec, pnn, nodemap);
3561 /* get nodemap from the recovery master to check if it is inactive */
3562 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3563 mem_ctx, &recmaster_nodemap);
3565 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3566 nodemap->nodes[j].pnn));
3571 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3572 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3573 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3575 * update our nodemap to carry the recmaster's notion of
3576 * its own flags, so that we don't keep freezing the
3577 * inactive recmaster node...
3579 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3580 force_election(rec, pnn, nodemap);
3584 /* verify that we have all ip addresses we should have and we dont
3585 * have addresses we shouldnt have.
3587 if (ctdb->tunable.disable_ip_failover == 0) {
3588 if (rec->ip_check_disable_ctx == NULL) {
3589 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3590 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3596 /* if we are not the recmaster then we do not need to check
3597 if recovery is needed
3599 if (pnn != rec->recmaster) {
3604 /* ensure our local copies of flags are right */
3605 ret = update_local_flags(rec, nodemap);
3606 if (ret == MONITOR_ELECTION_NEEDED) {
3607 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3608 force_election(rec, pnn, nodemap);
3611 if (ret != MONITOR_OK) {
3612 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3616 if (ctdb->num_nodes != nodemap->num) {
3617 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3618 reload_nodes_file(ctdb);
3622 /* verify that all active nodes agree that we are the recmaster */
3623 switch (verify_recmaster(rec, nodemap, pnn)) {
3624 case MONITOR_RECOVERY_NEEDED:
3625 /* can not happen */
3627 case MONITOR_ELECTION_NEEDED:
3628 force_election(rec, pnn, nodemap);
3632 case MONITOR_FAILED:
3637 if (rec->need_recovery) {
3638 /* a previous recovery didn't finish */
3639 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3643 /* verify that all active nodes are in normal mode
3644 and not in recovery mode
3646 switch (verify_recmode(ctdb, nodemap)) {
3647 case MONITOR_RECOVERY_NEEDED:
3648 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3650 case MONITOR_FAILED:
3652 case MONITOR_ELECTION_NEEDED:
3653 /* can not happen */
3659 if (ctdb->tunable.verify_recovery_lock != 0) {
3660 /* we should have the reclock - check its not stale */
3661 ret = check_recovery_lock(ctdb);
3663 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3664 ctdb_set_culprit(rec, ctdb->pnn);
3665 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3671 /* is there a pending reload all ips ? */
3672 if (reload_all_ips_request != NULL) {
3673 reload_all_ips(ctdb, rec, nodemap, reload_all_ips_request);
3674 talloc_free(reload_all_ips_request);
3675 reload_all_ips_request = NULL;
3678 /* if there are takeovers requested, perform it and notify the waiters */
3679 if (rec->reallocate_callers) {
3680 process_ipreallocate_requests(ctdb, rec);
3683 /* get the nodemap for all active remote nodes
3685 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3686 if (remote_nodemaps == NULL) {
3687 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3690 for(i=0; i<nodemap->num; i++) {
3691 remote_nodemaps[i] = NULL;
3693 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3694 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3698 /* verify that all other nodes have the same nodemap as we have
3700 for (j=0; j<nodemap->num; j++) {
3701 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3705 if (remote_nodemaps[j] == NULL) {
3706 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3707 ctdb_set_culprit(rec, j);
3712 /* if the nodes disagree on how many nodes there are
3713 then this is a good reason to try recovery
3715 if (remote_nodemaps[j]->num != nodemap->num) {
3716 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3717 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3718 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3719 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3723 /* if the nodes disagree on which nodes exist and are
3724 active, then that is also a good reason to do recovery
3726 for (i=0;i<nodemap->num;i++) {
3727 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3728 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3729 nodemap->nodes[j].pnn, i,
3730 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3731 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3732 do_recovery(rec, mem_ctx, pnn, nodemap,
3740 * Update node flags obtained from each active node. This ensure we have
3741 * up-to-date information for all the nodes.
3743 for (j=0; j<nodemap->num; j++) {
3744 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3747 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3750 for (j=0; j<nodemap->num; j++) {
3751 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3755 /* verify the flags are consistent
3757 for (i=0; i<nodemap->num; i++) {
3758 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3762 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3763 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3764 nodemap->nodes[j].pnn,
3765 nodemap->nodes[i].pnn,
3766 remote_nodemaps[j]->nodes[i].flags,
3767 nodemap->nodes[i].flags));
3769 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3770 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3771 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3772 do_recovery(rec, mem_ctx, pnn, nodemap,
3776 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3777 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3778 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3779 do_recovery(rec, mem_ctx, pnn, nodemap,
3788 /* there better be the same number of lmasters in the vnn map
3789 as there are active nodes or we will have to do a recovery
3791 if (vnnmap->size != rec->num_active) {
3792 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3793 vnnmap->size, rec->num_active));
3794 ctdb_set_culprit(rec, ctdb->pnn);
3795 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3799 /* verify that all active nodes in the nodemap also exist in
3802 for (j=0; j<nodemap->num; j++) {
3803 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3806 if (nodemap->nodes[j].pnn == pnn) {
3810 for (i=0; i<vnnmap->size; i++) {
3811 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3815 if (i == vnnmap->size) {
3816 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3817 nodemap->nodes[j].pnn));
3818 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3819 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3825 /* verify that all other nodes have the same vnnmap
3826 and are from the same generation
3828 for (j=0; j<nodemap->num; j++) {
3829 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3832 if (nodemap->nodes[j].pnn == pnn) {
3836 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3837 mem_ctx, &remote_vnnmap);
3839 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3840 nodemap->nodes[j].pnn));
3844 /* verify the vnnmap generation is the same */
3845 if (vnnmap->generation != remote_vnnmap->generation) {
3846 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3847 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3848 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3849 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3853 /* verify the vnnmap size is the same */
3854 if (vnnmap->size != remote_vnnmap->size) {
3855 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3856 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3857 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3858 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3862 /* verify the vnnmap is the same */
3863 for (i=0;i<vnnmap->size;i++) {
3864 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3865 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3866 nodemap->nodes[j].pnn));
3867 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3868 do_recovery(rec, mem_ctx, pnn, nodemap,
3875 /* we might need to change who has what IP assigned */
3876 if (rec->need_takeover_run) {
3877 uint32_t culprit = (uint32_t)-1;
3879 rec->need_takeover_run = false;
3881 /* update the list of public ips that a node can handle for
3884 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3886 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3888 rec->need_takeover_run = true;
3892 /* execute the "startrecovery" event script on all nodes */
3893 ret = run_startrecovery_eventscript(rec, nodemap);
3895 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3896 ctdb_set_culprit(rec, ctdb->pnn);
3897 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3901 /* If takeover run fails, then the offending nodes are
3902 * assigned ban culprit counts. And we re-try takeover.
3903 * If takeover run fails repeatedly, the node would get
3906 * If rec->need_takeover_run is not set to true at this
3907 * failure, monitoring is disabled cluster-wide (via
3908 * startrecovery eventscript) and will not get enabled.
3910 ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, rec);
3912 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Trying again\n"));
3916 /* execute the "recovered" event script on all nodes */
3917 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
3919 // we cant check whether the event completed successfully
3920 // since this script WILL fail if the node is in recovery mode
3921 // and if that race happens, the code here would just cause a second
3922 // cascading recovery.
3924 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3925 ctdb_set_culprit(rec, ctdb->pnn);
3926 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3933 the main monitoring loop
3935 static void monitor_cluster(struct ctdb_context *ctdb)
3937 struct ctdb_recoverd *rec;
3939 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3941 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3942 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3946 rec->priority_time = timeval_current();
3948 /* register a message port for sending memory dumps */
3949 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3951 /* register a message port for requesting logs */
3952 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG, getlog_handler, rec);
3954 /* register a message port for clearing logs */
3955 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG, clearlog_handler, rec);
3957 /* register a message port for recovery elections */
3958 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3960 /* when nodes are disabled/enabled */
3961 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3963 /* when we are asked to puch out a flag change */
3964 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3966 /* register a message port for vacuum fetch */
3967 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3969 /* register a message port for reloadnodes */
3970 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3972 /* register a message port for performing a takeover run */
3973 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3975 /* register a message port for performing a reload all ips */
3976 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_ALL_IPS, ip_reloadall_handler, rec);
3978 /* register a message port for disabling the ip check for a short while */
3979 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3981 /* register a message port for updating the recovery daemons node assignment for an ip */
3982 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3984 /* register a message port for forcing a rebalance of a node next
3986 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3989 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3990 struct timeval start;
3994 DEBUG(DEBUG_CRIT,(__location__
3995 " Failed to create temp context\n"));
3999 start = timeval_current();
4000 main_loop(ctdb, rec, mem_ctx);
4001 talloc_free(mem_ctx);
4003 /* we only check for recovery once every second */
4004 elapsed = timeval_elapsed(&start);
4005 if (elapsed < ctdb->tunable.recover_interval) {
4006 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4013 event handler for when the main ctdbd dies
4015 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4016 uint16_t flags, void *private_data)
4018 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4023 called regularly to verify that the recovery daemon is still running
4025 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4026 struct timeval yt, void *p)
4028 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
4030 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4031 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4033 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4034 ctdb_restart_recd, ctdb);
4039 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4040 timeval_current_ofs(30, 0),
4041 ctdb_check_recd, ctdb);
4044 static void recd_sig_child_handler(struct event_context *ev,
4045 struct signal_event *se, int signum, int count,
4049 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4054 pid = waitpid(-1, &status, WNOHANG);
4056 if (errno != ECHILD) {
4057 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4062 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4068 startup the recovery daemon as a child of the main ctdb daemon
4070 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4073 struct signal_event *se;
4074 struct tevent_fd *fde;
4076 if (pipe(fd) != 0) {
4080 ctdb->ctdbd_pid = getpid();
4082 ctdb->recoverd_pid = ctdb_fork_no_free_ringbuffer(ctdb);
4083 if (ctdb->recoverd_pid == -1) {
4087 if (ctdb->recoverd_pid != 0) {
4088 talloc_free(ctdb->recd_ctx);
4089 ctdb->recd_ctx = talloc_new(ctdb);
4090 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4093 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4094 timeval_current_ofs(30, 0),
4095 ctdb_check_recd, ctdb);
4101 srandom(getpid() ^ time(NULL));
4103 /* Clear the log ringbuffer */
4104 ctdb_clear_log(ctdb);
4106 ctdb_set_process_name("ctdb_recovered");
4107 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4108 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4112 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4114 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4115 ctdb_recoverd_parent, &fd[0]);
4116 tevent_fd_set_auto_close(fde);
4118 /* set up a handler to pick up sigchld */
4119 se = event_add_signal(ctdb->ev, ctdb,
4121 recd_sig_child_handler,
4124 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4128 monitor_cluster(ctdb);
4130 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4135 shutdown the recovery daemon
4137 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4139 if (ctdb->recoverd_pid == 0) {
4143 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4144 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4146 TALLOC_FREE(ctdb->recd_ctx);
4147 TALLOC_FREE(ctdb->recd_ping_count);
4150 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4151 struct timeval t, void *private_data)
4153 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4155 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4156 ctdb_stop_recoverd(ctdb);
4157 ctdb_start_recoverd(ctdb);