4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
36 #include "ctdb_private.h"
37 #include "ctdb_client.h"
39 #include "common/system.h"
40 #include "common/cmdline.h"
41 #include "common/common.h"
42 #include "common/logging.h"
45 /* List of SRVID requests that need to be processed */
47 struct srvid_list *next, *prev;
48 struct ctdb_srvid_message *request;
51 struct srvid_requests {
52 struct srvid_list *requests;
55 static void srvid_request_reply(struct ctdb_context *ctdb,
56 struct ctdb_srvid_message *request,
59 /* Someone that sent srvid==0 does not want a reply */
60 if (request->srvid == 0) {
65 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
67 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
68 (unsigned)request->pnn,
69 (unsigned long long)request->srvid));
71 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
79 static void srvid_requests_reply(struct ctdb_context *ctdb,
80 struct srvid_requests **requests,
85 for (r = (*requests)->requests; r != NULL; r = r->next) {
86 srvid_request_reply(ctdb, r->request, result);
89 /* Free the list structure... */
90 TALLOC_FREE(*requests);
93 static void srvid_request_add(struct ctdb_context *ctdb,
94 struct srvid_requests **requests,
95 struct ctdb_srvid_message *request)
101 if (*requests == NULL) {
102 *requests = talloc_zero(ctdb, struct srvid_requests);
103 if (*requests == NULL) {
108 t = talloc_zero(*requests, struct srvid_list);
110 /* If *requests was just allocated above then free it */
111 if ((*requests)->requests == NULL) {
112 TALLOC_FREE(*requests);
117 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
118 DLIST_ADD((*requests)->requests, t);
123 /* Failed to add the request to the list. Send a fail. */
124 DEBUG(DEBUG_ERR, (__location__
125 " Out of memory, failed to queue SRVID request\n"));
127 result.dsize = sizeof(ret);
128 result.dptr = (uint8_t *)&ret;
129 srvid_request_reply(ctdb, request, result);
132 /* An abstraction to allow an operation (takeover runs, recoveries,
133 * ...) to be disabled for a given timeout */
134 struct ctdb_op_state {
135 struct tevent_timer *timer;
140 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
142 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
145 state->in_progress = false;
152 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
154 return state->timer != NULL;
157 static bool ctdb_op_begin(struct ctdb_op_state *state)
159 if (ctdb_op_is_disabled(state)) {
161 ("Unable to begin - %s are disabled\n", state->name));
165 state->in_progress = true;
169 static bool ctdb_op_end(struct ctdb_op_state *state)
171 return state->in_progress = false;
174 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
176 return state->in_progress;
179 static void ctdb_op_enable(struct ctdb_op_state *state)
181 TALLOC_FREE(state->timer);
184 static void ctdb_op_timeout_handler(struct tevent_context *ev,
185 struct tevent_timer *te,
186 struct timeval yt, void *p)
188 struct ctdb_op_state *state =
189 talloc_get_type(p, struct ctdb_op_state);
191 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
192 ctdb_op_enable(state);
195 static int ctdb_op_disable(struct ctdb_op_state *state,
196 struct tevent_context *ev,
200 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
201 ctdb_op_enable(state);
205 if (state->in_progress) {
207 ("Unable to disable %s - in progress\n", state->name));
211 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
212 state->name, timeout));
214 /* Clear any old timers */
215 talloc_free(state->timer);
217 /* Arrange for the timeout to occur */
218 state->timer = tevent_add_timer(ev, state,
219 timeval_current_ofs(timeout, 0),
220 ctdb_op_timeout_handler, state);
221 if (state->timer == NULL) {
222 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
229 struct ctdb_banning_state {
231 struct timeval last_reported_time;
235 private state of recovery daemon
237 struct ctdb_recoverd {
238 struct ctdb_context *ctdb;
240 uint32_t last_culprit_node;
241 struct ctdb_node_map_old *nodemap;
242 struct timeval priority_time;
243 bool need_takeover_run;
246 struct tevent_timer *send_election_te;
247 struct tevent_timer *election_timeout;
248 struct srvid_requests *reallocate_requests;
249 struct ctdb_op_state *takeover_run;
250 struct ctdb_op_state *recovery;
251 struct ctdb_iface_list_old *ifaces;
252 uint32_t *force_rebalance_nodes;
253 struct ctdb_node_capabilities *caps;
256 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
257 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
259 static void ctdb_restart_recd(struct tevent_context *ev,
260 struct tevent_timer *te, struct timeval t,
264 ban a node for a period of time
266 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
269 struct ctdb_context *ctdb = rec->ctdb;
270 struct ctdb_ban_state bantime;
272 if (!ctdb_validate_pnn(ctdb, pnn)) {
273 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
277 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
280 bantime.time = ban_time;
282 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
284 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
290 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
294 remember the trouble maker
296 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
298 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
299 struct ctdb_banning_state *ban_state;
301 if (culprit > ctdb->num_nodes) {
302 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
306 /* If we are banned or stopped, do not set other nodes as culprits */
307 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
308 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
312 if (ctdb->nodes[culprit]->ban_state == NULL) {
313 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
314 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
318 ban_state = ctdb->nodes[culprit]->ban_state;
319 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
320 /* this was the first time in a long while this node
321 misbehaved so we will forgive any old transgressions.
323 ban_state->count = 0;
326 ban_state->count += count;
327 ban_state->last_reported_time = timeval_current();
328 rec->last_culprit_node = culprit;
332 remember the trouble maker
334 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
336 ctdb_set_culprit_count(rec, culprit, 1);
340 /* this callback is called for every node that failed to execute the
343 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
345 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
347 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
349 ctdb_set_culprit(rec, node_pnn);
353 run the "recovered" eventscript on all nodes
355 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, const char *caller)
359 struct ctdb_context *ctdb = rec->ctdb;
361 tmp_ctx = talloc_new(ctdb);
362 CTDB_NO_MEMORY(ctdb, tmp_ctx);
364 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
365 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
367 CONTROL_TIMEOUT(), false, tdb_null,
368 NULL, recovered_fail_callback,
370 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
372 talloc_free(tmp_ctx);
376 talloc_free(tmp_ctx);
380 /* this callback is called for every node that failed to execute the
383 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
385 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
387 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
389 ctdb_set_culprit(rec, node_pnn);
393 run the "startrecovery" eventscript on all nodes
395 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
399 struct ctdb_context *ctdb = rec->ctdb;
401 tmp_ctx = talloc_new(ctdb);
402 CTDB_NO_MEMORY(ctdb, tmp_ctx);
404 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
405 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
407 CONTROL_TIMEOUT(), false, tdb_null,
409 startrecovery_fail_callback,
411 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
412 talloc_free(tmp_ctx);
416 talloc_free(tmp_ctx);
421 Retrieve capabilities from all connected nodes
423 static int update_capabilities(struct ctdb_recoverd *rec,
424 struct ctdb_node_map_old *nodemap)
428 struct ctdb_node_capabilities *caps;
429 struct ctdb_context *ctdb = rec->ctdb;
431 tmp_ctx = talloc_new(rec);
432 CTDB_NO_MEMORY(ctdb, tmp_ctx);
434 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
435 CONTROL_TIMEOUT(), nodemap);
439 (__location__ " Failed to get node capabilities\n"));
440 talloc_free(tmp_ctx);
444 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
448 " Capabilities don't include current node.\n"));
449 talloc_free(tmp_ctx);
452 ctdb->capabilities = *capp;
454 TALLOC_FREE(rec->caps);
455 rec->caps = talloc_steal(rec, caps);
457 talloc_free(tmp_ctx);
461 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
463 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
465 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
466 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
469 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
471 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
473 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
474 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
478 change recovery mode on all nodes
480 static int set_recovery_mode(struct ctdb_context *ctdb,
481 struct ctdb_recoverd *rec,
482 struct ctdb_node_map_old *nodemap,
483 uint32_t rec_mode, bool freeze)
489 tmp_ctx = talloc_new(ctdb);
490 CTDB_NO_MEMORY(ctdb, tmp_ctx);
492 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
494 data.dsize = sizeof(uint32_t);
495 data.dptr = (unsigned char *)&rec_mode;
497 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
503 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
504 talloc_free(tmp_ctx);
508 /* freeze all nodes */
509 if (freeze && rec_mode == CTDB_RECOVERY_ACTIVE) {
512 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
513 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
518 set_recmode_fail_callback,
520 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
521 talloc_free(tmp_ctx);
527 talloc_free(tmp_ctx);
531 /* update all remote nodes to use the same db priority that we have
532 this can fail if the remove node has not yet been upgraded to
533 support this function, so we always return success and never fail
534 a recovery if this call fails.
536 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
537 struct ctdb_node_map_old *nodemap,
538 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
542 /* step through all local databases */
543 for (db=0; db<dbmap->num;db++) {
544 struct ctdb_db_priority db_prio;
547 db_prio.db_id = dbmap->dbs[db].db_id;
548 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].db_id, &db_prio.priority);
550 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].db_id));
554 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].db_id, db_prio.priority));
556 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
557 CTDB_CURRENT_NODE, &db_prio);
559 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
568 ensure all other nodes have attached to any databases that we have
570 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
571 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
574 struct ctdb_dbid_map_old *remote_dbmap;
576 /* verify that all other nodes have all our databases */
577 for (j=0; j<nodemap->num; j++) {
578 /* we don't need to ourself ourselves */
579 if (nodemap->nodes[j].pnn == pnn) {
582 /* don't check nodes that are unavailable */
583 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
587 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
588 mem_ctx, &remote_dbmap);
590 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
594 /* step through all local databases */
595 for (db=0; db<dbmap->num;db++) {
599 for (i=0;i<remote_dbmap->num;i++) {
600 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
604 /* the remote node already have this database */
605 if (i!=remote_dbmap->num) {
608 /* ok so we need to create this database */
609 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
610 dbmap->dbs[db].db_id, mem_ctx,
613 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
616 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
617 nodemap->nodes[j].pnn,
619 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
621 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
632 ensure we are attached to any databases that anyone else is attached to
634 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
635 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
638 struct ctdb_dbid_map_old *remote_dbmap;
640 /* verify that we have all database any other node has */
641 for (j=0; j<nodemap->num; j++) {
642 /* we don't need to ourself ourselves */
643 if (nodemap->nodes[j].pnn == pnn) {
646 /* don't check nodes that are unavailable */
647 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
651 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
652 mem_ctx, &remote_dbmap);
654 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
658 /* step through all databases on the remote node */
659 for (db=0; db<remote_dbmap->num;db++) {
662 for (i=0;i<(*dbmap)->num;i++) {
663 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
667 /* we already have this db locally */
668 if (i!=(*dbmap)->num) {
671 /* ok so we need to create this database and
674 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
675 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
677 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
678 nodemap->nodes[j].pnn));
681 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
682 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
684 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
687 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
689 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
700 pull the remote database contents from one node into the recdb
702 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
703 struct tdb_wrap *recdb, uint32_t dbid)
707 struct ctdb_marshall_buffer *reply;
708 struct ctdb_rec_data_old *recdata;
710 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
712 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
713 CONTROL_TIMEOUT(), &outdata);
715 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
716 talloc_free(tmp_ctx);
720 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
722 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
723 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
724 talloc_free(tmp_ctx);
728 recdata = (struct ctdb_rec_data_old *)&reply->data[0];
732 recdata = (struct ctdb_rec_data_old *)(recdata->length + (uint8_t *)recdata), i++) {
734 struct ctdb_ltdb_header *hdr;
737 key.dptr = &recdata->data[0];
738 key.dsize = recdata->keylen;
739 data.dptr = &recdata->data[key.dsize];
740 data.dsize = recdata->datalen;
742 hdr = (struct ctdb_ltdb_header *)data.dptr;
744 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
745 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
746 talloc_free(tmp_ctx);
750 /* fetch the existing record, if any */
751 existing = tdb_fetch(recdb->tdb, key);
753 if (existing.dptr != NULL) {
754 struct ctdb_ltdb_header header;
755 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
756 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
757 (unsigned)existing.dsize, srcnode));
759 talloc_free(tmp_ctx);
762 header = *(struct ctdb_ltdb_header *)existing.dptr;
764 if (!(header.rsn < hdr->rsn ||
765 (header.dmaster != ctdb_get_pnn(ctdb) &&
766 header.rsn == hdr->rsn))) {
771 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
772 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
773 talloc_free(tmp_ctx);
778 talloc_free(tmp_ctx);
784 struct pull_seqnum_cbdata {
790 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
792 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
795 if (cb_data->failed != 0) {
796 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
801 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
806 if (outdata.dsize != sizeof(uint64_t)) {
807 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
808 cb_data->failed = -1;
812 seqnum = *((uint64_t *)outdata.dptr);
814 if (seqnum > cb_data->seqnum ||
815 (cb_data->pnn == -1 && seqnum == 0)) {
816 cb_data->seqnum = seqnum;
817 cb_data->pnn = node_pnn;
821 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
823 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
825 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
829 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
830 struct ctdb_recoverd *rec,
831 struct ctdb_node_map_old *nodemap,
832 struct tdb_wrap *recdb, uint32_t dbid)
834 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
838 struct pull_seqnum_cbdata *cb_data;
840 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
845 data.dsize = sizeof(outdata);
846 data.dptr = (uint8_t *)&outdata[0];
848 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
849 if (cb_data == NULL) {
850 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
851 talloc_free(tmp_ctx);
859 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
860 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
862 CONTROL_TIMEOUT(), false, data,
866 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
868 talloc_free(tmp_ctx);
872 if (cb_data->failed != 0) {
873 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
874 talloc_free(tmp_ctx);
878 if (cb_data->pnn == -1) {
879 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
880 talloc_free(tmp_ctx);
884 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
886 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
887 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
888 talloc_free(tmp_ctx);
892 talloc_free(tmp_ctx);
898 pull all the remote database contents into the recdb
900 static int pull_remote_database(struct ctdb_context *ctdb,
901 struct ctdb_recoverd *rec,
902 struct ctdb_node_map_old *nodemap,
903 struct tdb_wrap *recdb, uint32_t dbid,
908 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
910 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
916 /* pull all records from all other nodes across onto this node
917 (this merges based on rsn)
919 for (j=0; j<nodemap->num; j++) {
920 /* don't merge from nodes that are unavailable */
921 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
924 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
925 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
926 nodemap->nodes[j].pnn));
927 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
937 update flags on all active nodes
939 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
943 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
945 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
953 ensure all nodes have the same vnnmap we do
955 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
956 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
960 /* push the new vnn map out to all the nodes */
961 for (j=0; j<nodemap->num; j++) {
962 /* don't push to nodes that are unavailable */
963 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
967 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
969 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
979 called when a vacuum fetch has completed - just free it and do the next one
981 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
988 * Process one elements of the vacuum fetch list:
989 * Migrate it over to us with the special flag
990 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
992 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
994 struct ctdb_rec_data_old *r)
996 struct ctdb_client_call_state *state;
998 struct ctdb_ltdb_header *hdr;
999 struct ctdb_call call;
1002 call.call_id = CTDB_NULL_FUNC;
1003 call.flags = CTDB_IMMEDIATE_MIGRATION;
1004 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1006 call.key.dptr = &r->data[0];
1007 call.key.dsize = r->keylen;
1009 /* ensure we don't block this daemon - just skip a record if we can't get
1011 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
1015 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
1016 if (data.dptr == NULL) {
1017 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1021 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1023 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1027 hdr = (struct ctdb_ltdb_header *)data.dptr;
1028 if (hdr->dmaster == pnn) {
1029 /* its already local */
1031 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1037 state = ctdb_call_send(ctdb_db, &call);
1038 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1039 if (state == NULL) {
1040 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1043 state->async.fn = vacuum_fetch_callback;
1044 state->async.private_data = NULL;
1051 handler for vacuum fetch
1053 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
1056 struct ctdb_recoverd *rec = talloc_get_type(
1057 private_data, struct ctdb_recoverd);
1058 struct ctdb_context *ctdb = rec->ctdb;
1059 struct ctdb_marshall_buffer *recs;
1061 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1063 struct ctdb_dbid_map_old *dbmap=NULL;
1064 bool persistent = false;
1065 struct ctdb_db_context *ctdb_db;
1066 struct ctdb_rec_data_old *r;
1068 recs = (struct ctdb_marshall_buffer *)data.dptr;
1070 if (recs->count == 0) {
1074 /* work out if the database is persistent */
1075 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1077 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1081 for (i=0;i<dbmap->num;i++) {
1082 if (dbmap->dbs[i].db_id == recs->db_id) {
1083 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1087 if (i == dbmap->num) {
1088 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1092 /* find the name of this database */
1093 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1094 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1099 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1100 if (ctdb_db == NULL) {
1101 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1105 r = (struct ctdb_rec_data_old *)&recs->data[0];
1106 while (recs->count) {
1109 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
1114 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
1119 talloc_free(tmp_ctx);
1124 * handler for database detach
1126 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
1129 struct ctdb_recoverd *rec = talloc_get_type(
1130 private_data, struct ctdb_recoverd);
1131 struct ctdb_context *ctdb = rec->ctdb;
1133 struct ctdb_db_context *ctdb_db;
1135 if (data.dsize != sizeof(db_id)) {
1138 db_id = *(uint32_t *)data.dptr;
1140 ctdb_db = find_ctdb_db(ctdb, db_id);
1141 if (ctdb_db == NULL) {
1142 /* database is not attached */
1146 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1148 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1150 talloc_free(ctdb_db);
1154 called when ctdb_wait_timeout should finish
1156 static void ctdb_wait_handler(struct tevent_context *ev,
1157 struct tevent_timer *te,
1158 struct timeval yt, void *p)
1160 uint32_t *timed_out = (uint32_t *)p;
1165 wait for a given number of seconds
1167 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1169 uint32_t timed_out = 0;
1170 time_t usecs = (secs - (time_t)secs) * 1000000;
1171 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
1172 ctdb_wait_handler, &timed_out);
1173 while (!timed_out) {
1174 tevent_loop_once(ctdb->ev);
1179 called when an election times out (ends)
1181 static void ctdb_election_timeout(struct tevent_context *ev,
1182 struct tevent_timer *te,
1183 struct timeval t, void *p)
1185 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1186 rec->election_timeout = NULL;
1189 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1194 wait for an election to finish. It finished election_timeout seconds after
1195 the last election packet is received
1197 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1199 struct ctdb_context *ctdb = rec->ctdb;
1200 while (rec->election_timeout) {
1201 tevent_loop_once(ctdb->ev);
1206 Update our local flags from all remote connected nodes.
1207 This is only run when we are or we belive we are the recovery master
1209 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
1212 struct ctdb_context *ctdb = rec->ctdb;
1213 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1215 /* get the nodemap for all active remote nodes and verify
1216 they are the same as for this node
1218 for (j=0; j<nodemap->num; j++) {
1219 struct ctdb_node_map_old *remote_nodemap=NULL;
1222 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1225 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1229 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1230 mem_ctx, &remote_nodemap);
1232 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1233 nodemap->nodes[j].pnn));
1234 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1235 talloc_free(mem_ctx);
1236 return MONITOR_FAILED;
1238 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1239 /* We should tell our daemon about this so it
1240 updates its flags or else we will log the same
1241 message again in the next iteration of recovery.
1242 Since we are the recovery master we can just as
1243 well update the flags on all nodes.
1245 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1247 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1251 /* Update our local copy of the flags in the recovery
1254 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1255 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1256 nodemap->nodes[j].flags));
1257 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1259 talloc_free(remote_nodemap);
1261 talloc_free(mem_ctx);
1266 /* Create a new random generation id.
1267 The generation id can not be the INVALID_GENERATION id
1269 static uint32_t new_generation(void)
1271 uint32_t generation;
1274 generation = random();
1276 if (generation != INVALID_GENERATION) {
1286 create a temporary working database
1288 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1291 struct tdb_wrap *recdb;
1294 /* open up the temporary recovery database */
1295 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1296 ctdb->db_directory_state,
1303 tdb_flags = TDB_NOLOCK;
1304 if (ctdb->valgrinding) {
1305 tdb_flags |= TDB_NOMMAP;
1307 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1309 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1310 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1311 if (recdb == NULL) {
1312 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1322 a traverse function for pulling all relevant records from recdb
1325 struct ctdb_context *ctdb;
1326 struct ctdb_marshall_buffer *recdata;
1328 uint32_t allocated_len;
/* tdb traverse callback: append one record from the scratch recovery db
 * to the marshall buffer in *p (a struct recdb_data).  Empty records are
 * skipped for non-persistent dbs; dmaster is rewritten to this node.
 * On allocation failure sets params->failed.  (NOTE(review): return
 * statements are elided in this view.) */
1333 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1335 struct recdb_data *params = (struct recdb_data *)p;
1336 struct ctdb_rec_data_old *recdata;
1337 struct ctdb_ltdb_header *hdr;
1340 * skip empty records - but NOT for persistent databases:
1342 * The record-by-record mode of recovery deletes empty records.
1343 * For persistent databases, this can lead to data corruption
1344 * by deleting records that should be there:
1346 * - Assume the cluster has been running for a while.
1348 * - A record R in a persistent database has been created and
1349 * deleted a couple of times, the last operation being deletion,
1350 * leaving an empty record with a high RSN, say 10.
1352 * - Now a node N is turned off.
1354 * - This leaves the local database copy of D on N with the empty
1355 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1356 * the copy of record R.
1358 * - Now the record is created again while node N is turned off.
1359 * This creates R with RSN = 1 on all nodes except for N.
1361 * - Now node N is turned on again. The following recovery will chose
1362 * the older empty copy of R due to RSN 10 > RSN 1.
1364 * ==> Hence the record is gone after the recovery.
1366 * On databases like Samba's registry, this can damage the higher-level
1367 * data structures built from the various tdb-level records.
/* "Empty" means the value holds only the ltdb header, no payload. */
1369 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1373 /* update the dmaster field to point to us */
1374 hdr = (struct ctdb_ltdb_header *)data.dptr;
1375 if (!params->persistent) {
1376 hdr->dmaster = params->ctdb->pnn;
1377 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1380 /* add the record to the blob ready to send to the nodes */
1381 recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1382 if (recdata == NULL) {
1383 params->failed = true;
/* Grow the marshall buffer ahead of need; pulldb_preallocation_size
 * provides slack to reduce realloc churn. */
1386 if (params->len + recdata->length >= params->allocated_len) {
1387 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1388 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1390 if (params->recdata == NULL) {
1391 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1392 recdata->length + params->len));
1393 params->failed = true;
1396 params->recdata->count++;
/* Append the marshalled record at current write offset params->len. */
1397 memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1398 params->len += recdata->length;
1399 talloc_free(recdata);
/* Push the merged scratch recovery database out to all active nodes via
 * CTDB_CONTROL_PUSH_DB.  Returns 0-ish success / error per elided return
 * paths.  Fix: the tdb_traverse_read() call contained HTML-entity
 * mojibake "¶ms" where the address-of expression "&params" belongs;
 * restored the correct operator.  (NOTE(review): interior lines are
 * elided in this view — return statements are not visible.) */
1405 push the recdb database out to all nodes
1407 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1409 struct tdb_wrap *recdb, struct ctdb_node_map_old *nodemap)
1411 struct recdb_data params;
1412 struct ctdb_marshall_buffer *recdata;
1414 TALLOC_CTX *tmp_ctx;
1417 tmp_ctx = talloc_new(ctdb);
1418 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1420 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1421 CTDB_NO_MEMORY(ctdb, recdata);
1423 recdata->db_id = dbid;
/* Start the write offset just past the fixed marshall header. */
1426 params.recdata = recdata;
1427 params.len = offsetof(struct ctdb_marshall_buffer, data);
1428 params.allocated_len = params.len;
1429 params.failed = false;
1430 params.persistent = persistent;
/* Collect every record into params.recdata (see traverse_recdb). */
1432 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1433 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1434 talloc_free(params.recdata);
1435 talloc_free(tmp_ctx);
1439 if (params.failed) {
1440 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1441 talloc_free(params.recdata);
1442 talloc_free(tmp_ctx);
/* traverse_recdb may have realloc'd the buffer; pick up the new pointer. */
1446 recdata = params.recdata;
1448 outdata.dptr = (void *)recdata;
1449 outdata.dsize = params.len;
1451 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1452 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1454 CONTROL_TIMEOUT(), false, outdata,
1457 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1458 talloc_free(recdata);
1459 talloc_free(tmp_ctx);
1463 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1464 dbid, recdata->count));
1466 talloc_free(recdata);
1467 talloc_free(tmp_ctx);
/* Run a full recovery for one database: pull all remote copies into a
 * scratch db, wipe the db cluster-wide inside the open transaction, then
 * push the merged content back out.  (NOTE(review): interior lines are
 * elided in this view — error returns and the final return are missing.) */
1474 go through a full recovery on one database
1476 static int recover_database(struct ctdb_recoverd *rec,
1477 TALLOC_CTX *mem_ctx,
1481 struct ctdb_node_map_old *nodemap,
1482 uint32_t transaction_id)
1484 struct tdb_wrap *recdb;
1486 struct ctdb_context *ctdb = rec->ctdb;
1488 struct ctdb_transdb w;
1491 recdb = create_recdb(ctdb, mem_ctx);
1492 if (recdb == NULL) {
1496 /* pull all remote databases onto the recdb */
1497 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1499 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1503 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1505 /* wipe all the remote databases. This is safe as we are in a transaction */
1507 w.tid = transaction_id;
1509 data.dptr = (void *)&w;
1510 data.dsize = sizeof(w);
1512 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1513 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1515 CONTROL_TIMEOUT(), false, data,
1518 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1523 /* push out the correct database. This sets the dmaster and skips
1524 the empty records */
1525 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1531 /* all done with this database */
1537 /* when we start a recovery, make sure all nodes use the same reclock file
/* Broadcast this node's recovery-lock file path to all active nodes via
 * CTDB_CONTROL_SET_RECLOCK_FILE.  No-op when no reclock file is set.
 * (NOTE(review): return statements are elided in this view.) */
1540 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1542 struct ctdb_context *ctdb = rec->ctdb;
1543 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1547 if (ctdb->recovery_lock_file == NULL) {
/* +1 to include the NUL terminator in the transmitted path. */
1551 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1552 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1555 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1556 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1562 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1563 talloc_free(tmp_ctx);
1567 talloc_free(tmp_ctx);
1573 * this callback is called for every node that failed to execute ctdb_takeover_run()
1574 * and set flag to re-run takeover run.
/* callback_data is the recoverd context when banning credits are wanted
 * for failing nodes, NULL otherwise (see do_takeover_run). */
1576 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1578 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1580 if (callback_data != NULL) {
1581 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1583 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
/* Charges a banning credit to the failing node. */
1585 ctdb_set_culprit(rec, node_pnn);
/* Ban every node whose accumulated banning credits reached the threshold
 * (2 * number of nodes), for recovery_ban_period seconds, then reset its
 * counter.  *self_ban is presumably set when this node bans itself —
 * TODO confirm, the assignment line is elided in this view. */
1590 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1592 struct ctdb_context *ctdb = rec->ctdb;
1594 struct ctdb_banning_state *ban_state;
1597 for (i=0; i<ctdb->num_nodes; i++) {
1598 if (ctdb->nodes[i]->ban_state == NULL) {
1601 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
/* Threshold: twice the cluster size worth of credits. */
1602 if (ban_state->count < 2*ctdb->num_nodes) {
1606 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1607 ctdb->nodes[i]->pnn, ban_state->count,
1608 ctdb->tunable.recovery_ban_period));
1609 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1610 ban_state->count = 0;
1612 /* Banning ourself? */
1613 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
/* Perform a public-IP takeover run across the cluster.  Temporarily
 * disables takeover runs on other nodes, runs ctdb_takeover_run(), then
 * re-enables them.  Returns presumably true on success ("ok") — TODO
 * confirm, interior lines are elided in this view.  When
 * banning_credits_on_fail is set, failing nodes are charged a banning
 * credit via takeover_fail_callback. */
1619 static bool do_takeover_run(struct ctdb_recoverd *rec,
1620 struct ctdb_node_map_old *nodemap,
1621 bool banning_credits_on_fail)
1623 uint32_t *nodes = NULL;
1624 struct ctdb_disable_message dtr;
/* Snapshot the rebalance list so we can tell if it changed during the run. */
1627 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1631 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
/* Serialize takeover runs: bail out if one is already running. */
1633 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1634 DEBUG(DEBUG_ERR, (__location__
1635 " takeover run already in progress \n"));
1640 if (!ctdb_op_begin(rec->takeover_run)) {
1645 /* Disable IP checks (takeover runs, really) on other nodes
1646 * while doing this takeover run. This will stop those other
1647 * nodes from triggering takeover runs when think they should
1648 * be hosting an IP but it isn't yet on an interface. Don't
1649 * wait for replies since a failure here might cause some
1650 * noise in the logs but will not actually cause a problem.
1652 dtr.srvid = 0; /* No reply */
1655 data.dptr = (uint8_t*)&dtr;
1656 data.dsize = sizeof(dtr);
1658 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1660 /* Disable for 60 seconds. This can be a tunable later if
1664 for (i = 0; i < talloc_array_length(nodes); i++) {
1665 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1666 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1668 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1672 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1673 rec->force_rebalance_nodes,
1674 takeover_fail_callback,
1675 banning_credits_on_fail ? rec : NULL);
1677 /* Reenable takeover runs and IP checks on other nodes */
1679 for (i = 0; i < talloc_array_length(nodes); i++) {
1680 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1681 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1683 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1688 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1694 /* Takeover run was successful so clear force rebalance targets */
/* Only clear if the list pointer is unchanged since the snapshot above. */
1695 if (rebalance_nodes == rec->force_rebalance_nodes) {
1696 TALLOC_FREE(rec->force_rebalance_nodes);
1698 DEBUG(DEBUG_WARNING,
1699 ("Rebalance target nodes changed during takeover run - not clearing\n"));
/* A failed run leaves need_takeover_run set so it is retried later. */
1702 rec->need_takeover_run = !ok;
1704 ctdb_op_end(rec->takeover_run);
1706 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
/* State shared between db_recovery_parallel() and ctdb_recovery_handler().
 * Members are elided in this view; usage implies a pipe fd pair fd[2],
 * helper pid, an int result and a done flag — TODO confirm. */
1710 struct recovery_helper_state {
/* tevent fd handler: read the helper's exit status integer from the pipe;
 * a short/failed read is mapped to EPIPE.  (NOTE(review): the line that
 * sets state->done is elided in this view.) */
1717 static void ctdb_recovery_handler(struct tevent_context *ev,
1718 struct tevent_fd *fde,
1719 uint16_t flags, void *private_data)
1721 struct recovery_helper_state *state = talloc_get_type_abort(
1722 private_data, struct recovery_helper_state);
1725 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1726 if (ret != sizeof(state->result)) {
1727 state->result = EPIPE;
/* Run database recovery by forking the external ctdb_recovery_helper
 * binary and waiting (via the event loop) for its result on a pipe.
 * Returns the helper's result code path / error (interior return lines
 * are elided in this view). */
1734 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
/* Helper path is resolved once and cached across calls. */
1736 static char prog[PATH_MAX+1] = "";
1738 struct recovery_helper_state *state;
1739 struct tevent_fd *fde;
1742 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1743 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1744 "ctdb_recovery_helper")) {
1745 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1748 state = talloc_zero(mem_ctx, struct recovery_helper_state);
1749 if (state == NULL) {
1750 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1756 ret = pipe(state->fd);
1759 ("Failed to create pipe for recovery helper\n"));
/* Read end must not leak into the helper child process. */
1763 set_close_on_exec(state->fd[0]);
1766 args = talloc_array(state, const char *, nargs);
1768 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
/* argv: write-end fd, daemon socket name, fresh generation id. */
1772 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1773 args[1] = rec->ctdb->daemon.name;
1774 args[2] = talloc_asprintf(args, "%u", new_generation());
1777 if (args[0] == NULL || args[2] == NULL) {
1778 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1782 if (!ctdb_vfork_with_logging(state, rec->ctdb, "recovery", prog, nargs,
1783 args, NULL, NULL, &state->pid)) {
1785 ("Failed to create child for recovery helper\n"));
/* Parent no longer needs the write end; the helper owns it now. */
1789 close(state->fd[1]);
1792 state->done = false;
1794 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1795 TEVENT_FD_READ, ctdb_recovery_handler, state);
1799 tevent_fd_set_auto_close(fde);
/* Pump the event loop until the handler marks completion. */
1801 while (!state->done) {
1802 tevent_loop_once(rec->ctdb->ev);
1805 close(state->fd[0]);
1808 if (state->result != 0) {
/* Helper failed: make sure the child is gone. */
1812 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/* Error-path cleanup: close any open pipe ends and kill the child. */
1817 if (state->fd[0] != -1) {
1818 close(state->fd[0]);
1820 if (state->fd[1] != -1) {
1821 close(state->fd[1]);
1823 if (state->pid != -1) {
1824 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
/* Legacy serial database recovery: freeze the cluster, start a
 * cluster-wide transaction, recover each database in turn via
 * recover_database(), commit, rebuild and distribute the vnn map, then
 * leave recovery mode.  (NOTE(review): many interior lines — error
 * returns, loop closings — are elided in this view.) */
1830 static int db_recovery_serial(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1831 uint32_t pnn, struct ctdb_node_map_old *nodemap,
1832 struct ctdb_vnn_map *vnnmap,
1833 struct ctdb_dbid_map_old *dbmap)
1835 struct ctdb_context *ctdb = rec->ctdb;
1836 uint32_t generation;
1841 /* set recovery mode to active on all nodes */
1842 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, true);
1844 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1848 /* execute the "startrecovery" event script on all nodes */
1849 ret = run_startrecovery_eventscript(rec, nodemap);
1851 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1855 /* pick a new generation number */
1856 generation = new_generation();
1858 /* change the vnnmap on this node to use the new generation
1859 number but not on any other nodes.
1860 this guarantees that if we abort the recovery prematurely
1861 for some reason (a node stops responding?)
1862 that we can just return immediately and we will reenter
1863 recovery shortly again.
1864 I.e. we deliberately leave the cluster with an inconsistent
1865 generation id to allow us to abort recovery at any stage and
1866 just restart it from scratch.
1868 vnnmap->generation = generation;
1869 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1871 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1875 /* Database generations are updated when the transaction is commited to
1876 * the databases. So make sure to use the final generation as the
/* Second fresh id: this one ends up as the committed db generation. */
1879 generation = new_generation();
1881 data.dptr = (void *)&generation;
1882 data.dsize = sizeof(uint32_t);
1884 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1885 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1887 CONTROL_TIMEOUT(), false, data,
1889 transaction_start_fail_callback,
1891 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
/* Best-effort rollback of any transactions that did start. */
1892 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1894 CONTROL_TIMEOUT(), false, tdb_null,
1898 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1903 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
/* Recover every known database inside the open transaction. */
1905 for (i=0;i<dbmap->num;i++) {
1906 ret = recover_database(rec, mem_ctx,
1907 dbmap->dbs[i].db_id,
1908 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1909 pnn, nodemap, generation);
1911 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].db_id));
1916 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1918 /* commit all the changes */
1919 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1921 CONTROL_TIMEOUT(), false, data,
1924 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1928 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1930 /* build a new vnn map with all the currently active and
1932 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1933 CTDB_NO_MEMORY(ctdb, vnnmap);
1934 vnnmap->generation = generation;
1936 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1937 CTDB_NO_MEMORY(ctdb, vnnmap->map);
/* j counts accepted lmaster-capable active nodes. */
1938 for (i=j=0;i<nodemap->num;i++) {
1939 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1942 if (!ctdb_node_has_capabilities(rec->caps,
1943 ctdb->nodes[i]->pnn,
1944 CTDB_CAP_LMASTER)) {
1945 /* this node can not be an lmaster */
1946 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1951 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1952 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1953 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* Degenerate case: nobody can be lmaster, fall back to ourselves. */
1956 if (vnnmap->size == 0) {
1957 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1959 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1960 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1961 vnnmap->map[0] = pnn;
1964 /* update to the new vnnmap on all nodes */
1965 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1967 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1971 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1973 /* disable recovery mode */
1974 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL, false);
1976 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1980 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
/* Top-level recovery driver, run only on the recovery master: take the
 * recovery lock, reconcile databases and flags across nodes, run the
 * actual db recovery (parallel helper if all nodes support it, else
 * serial), do a takeover run, fire the "recovered" event, notify clients
 * and reset banning credits.  (NOTE(review): many interior lines —
 * gotos, returns, closing braces — are elided in this view.) */
1986 we are the recmaster, and recovery is needed - start a recovery run
1988 static int do_recovery(struct ctdb_recoverd *rec,
1989 TALLOC_CTX *mem_ctx, uint32_t pnn,
1990 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1992 struct ctdb_context *ctdb = rec->ctdb;
1994 struct ctdb_dbid_map_old *dbmap;
1995 struct timeval start_time;
1999 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
2001 /* Check if the current node is still the recmaster. It's possible that
2002 * re-election has changed the recmaster.
2004 if (pnn != rec->recmaster) {
2006 ("Recovery master changed to %u, aborting recovery\n",
2011 /* if recovery fails, force it again */
2012 rec->need_recovery = true;
2014 if (!ctdb_op_begin(rec->recovery)) {
2018 if (rec->election_timeout) {
2019 /* an election is in progress */
2020 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
2024 ban_misbehaving_nodes(rec, &self_ban);
2026 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
/* Recovery lock: mutual exclusion against a split-brain second recmaster. */
2030 if (ctdb->recovery_lock_file != NULL) {
2031 if (ctdb_recovery_have_lock(ctdb)) {
2032 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
2034 start_time = timeval_current();
2035 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
2036 ctdb->recovery_lock_file));
2037 if (!ctdb_recovery_lock(ctdb)) {
2038 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
2039 /* If ctdb is trying first recovery, it's
2040 * possible that current node does not know
2041 * yet who the recmaster is.
2043 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
2044 " - retrying recovery\n"));
/* Past first recovery, failing to take the lock means we are the
 * odd one out: ban ourselves rather than fight the lock holder. */
2048 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
2049 "and ban ourself for %u seconds\n",
2050 ctdb->tunable.recovery_ban_period));
2051 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2054 ctdb_ctrl_report_recd_lock_latency(ctdb,
2056 timeval_elapsed(&start_time));
2058 ("Recovery lock taken successfully by recovery daemon\n"));
2062 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
2064 /* get a list of all databases */
2065 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
2067 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
2071 /* we do the db creation before we set the recovery mode, so the freeze happens
2072 on all databases we will be dealing with. */
2074 /* verify that we have all the databases any other node has */
2075 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
2077 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
2081 /* verify that all other nodes have all our databases */
2082 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
2084 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
2087 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
2089 /* update the database priority for all remote databases */
2090 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
2092 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
2094 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
2097 /* update all other nodes to use the same setting for reclock files
2098 as the local recovery master.
/* Return value deliberately ignored; failure here is non-fatal. */
2100 sync_recovery_lock_file_across_cluster(rec);
2102 /* Retrieve capabilities from all connected nodes */
2103 ret = update_capabilities(rec, nodemap);
2105 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2110 update all nodes to have the same flags that we have
2112 for (i=0;i<nodemap->num;i++) {
2113 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2117 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
/* Failing to update an inactive node is only a warning. */
2119 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2120 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
2122 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2128 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2130 /* Check if all participating nodes have parallel recovery capability */
2131 par_recovery = true;
2132 for (i=0; i<nodemap->num; i++) {
2133 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2137 if (!(rec->caps[i].capabilities &
2138 CTDB_CAP_PARALLEL_RECOVERY)) {
2139 par_recovery = false;
/* Parallel (helper-based) path if everyone supports it, else serial. */
2145 ret = db_recovery_parallel(rec, mem_ctx);
2147 ret = db_recovery_serial(rec, mem_ctx, pnn, nodemap, vnnmap,
2155 do_takeover_run(rec, nodemap, false);
2157 /* execute the "recovered" event script on all nodes */
2158 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2160 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2164 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2166 /* send a message to all clients telling them that the cluster
2167 has been reconfigured */
2168 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2169 CTDB_SRVID_RECONFIGURE, tdb_null);
2171 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2175 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2177 rec->need_recovery = false;
2178 ctdb_op_end(rec->recovery);
2180 /* we managed to complete a full recovery, make sure to forgive
2181 any past sins by the nodes that could now participate in the
2184 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2185 for (i=0;i<nodemap->num;i++) {
2186 struct ctdb_banning_state *ban_state;
2188 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2192 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2193 if (ban_state == NULL) {
2197 ban_state->count = 0;
2200 /* We just finished a recovery successfully.
2201 We now wait for rerecovery_timeout before we allow
2202 another recovery to take place.
2204 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2205 ctdb_op_disable(rec->recovery, ctdb->ev,
2206 ctdb->tunable.rerecovery_timeout);
/* Failure path: close out the recovery operation before returning. */
2210 ctdb_op_end(rec->recovery);
2216 elections are won by first checking the number of connected nodes, then
2217 the priority time, then the pnn
/* Payload broadcast on CTDB_SRVID_ELECTION; compared field-by-field in
 * ctdb_election_win().  (NOTE(review): the pnn member line is elided in
 * this view but is referenced by ctdb_election_data/ctdb_election_win.) */
2219 struct election_message {
/* Count of nodes this candidate sees as connected (higher wins). */
2220 uint32_t num_connected;
/* Start time of the candidate's recovery daemon (older wins). */
2221 struct timeval priority_time;
/* Candidate's node flags (banned/stopped candidates lose). */
2223 uint32_t node_flags;
2227 form this nodes election data
/* Fill *em with this node's election credentials; also refreshes
 * rec->node_flags from the current nodemap. */
2229 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2232 struct ctdb_node_map_old *nodemap;
2233 struct ctdb_context *ctdb = rec->ctdb;
2237 em->pnn = rec->ctdb->pnn;
2238 em->priority_time = rec->priority_time;
2240 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2242 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2246 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2247 em->node_flags = rec->node_flags;
2249 for (i=0;i<nodemap->num;i++) {
2250 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2251 em->num_connected++;
2255 /* we shouldnt try to win this election if we cant be a recmaster */
2256 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
/* Cripple our own credentials: zero connections, newest possible time. */
2257 em->num_connected = 0;
2258 em->priority_time = timeval_current();
2261 talloc_free(nodemap);
2265 see if the given election data wins
/* Compare the remote candidate *em against our own credentials and
 * decide whether WE win the election.  Tie-breaking order: capability,
 * banned/stopped state, priority time, then pnn.  (NOTE(review): the
 * return statements between comparisons are elided in this view.) */
2267 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2269 struct election_message myem;
2272 ctdb_election_data(rec, &myem);
2274 /* we cant win if we don't have the recmaster capability */
2275 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2279 /* we cant win if we are banned */
2280 if (rec->node_flags & NODE_FLAGS_BANNED) {
2284 /* we cant win if we are stopped */
2285 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2289 /* we will automatically win if the other node is banned */
2290 if (em->node_flags & NODE_FLAGS_BANNED) {
2294 /* we will automatically win if the other node is banned */
2295 if (em->node_flags & NODE_FLAGS_STOPPED) {
2299 /* then the longest running node */
2301 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* Final tie-break on pnn; sign of cmp decides — elided here. */
2305 cmp = (int)myem.pnn - (int)em->pnn;
2312 send out an election request
/* Optimistically claim recmaster locally, record it in rec->recmaster,
 * then broadcast our election credentials to all nodes.  Returns the
 * result of the broadcast send (or elided error return on setrecmaster
 * failure). */
2314 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2317 TDB_DATA election_data;
2318 struct election_message emsg;
2320 struct ctdb_context *ctdb = rec->ctdb;
2322 srvid = CTDB_SRVID_ELECTION;
2324 ctdb_election_data(rec, &emsg);
2326 election_data.dsize = sizeof(struct election_message);
2327 election_data.dptr = (unsigned char *)&emsg;
2330 /* first we assume we will win the election and set
2331 recoverymaster to be ourself on the current node
2333 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2334 CTDB_CURRENT_NODE, pnn);
2336 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2339 rec->recmaster = pnn;
2341 /* send an election message to all active nodes */
2342 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2343 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2347 this function will unban all nodes in the cluster
/* Clear the BANNED flag on every connected node via ctdb_ctrl_modflags.
 * Best-effort: per-node failures are only logged. */
2349 static void unban_all_nodes(struct ctdb_context *ctdb)
2352 struct ctdb_node_map_old *nodemap;
2353 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2355 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2357 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2361 for (i=0;i<nodemap->num;i++) {
/* Only touch nodes that are both connected and currently banned. */
2362 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2363 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2364 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2365 nodemap->nodes[i].pnn, 0,
2368 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2373 talloc_free(tmp_ctx);
2378 we think we are winning the election - send a broadcast election request
/* Timer callback: (re)broadcast our election request and drop the
 * one-shot timer reference so a new one can be scheduled. */
2380 static void election_send_request(struct tevent_context *ev,
2381 struct tevent_timer *te,
2382 struct timeval t, void *p)
2384 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2387 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2389 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2392 TALLOC_FREE(rec->send_election_te);
2396 handler for memory dumps
/* SRVID handler: dump this daemon's talloc memory usage and send the
 * dump back to the requester identified in the message payload. */
2398 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2400 struct ctdb_recoverd *rec = talloc_get_type(
2401 private_data, struct ctdb_recoverd);
2402 struct ctdb_context *ctdb = rec->ctdb;
2403 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2406 struct ctdb_srvid_message *rd;
/* Payload must be exactly a return-address message. */
2408 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2409 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2410 talloc_free(tmp_ctx);
2413 rd = (struct ctdb_srvid_message *)data.dptr;
2415 dump = talloc_zero(tmp_ctx, TDB_DATA);
2417 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2418 talloc_free(tmp_ctx);
2421 ret = ctdb_dump_memory(ctdb, dump);
2423 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2424 talloc_free(tmp_ctx);
2428 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
/* Reply goes to the pnn/srvid the requester put in the message. */
2430 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2432 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2433 talloc_free(tmp_ctx);
2437 talloc_free(tmp_ctx);
2441 handler for reload_nodes
/* SRVID handler: re-read the nodes file on request; payload is unused. */
2443 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
2446 struct ctdb_recoverd *rec = talloc_get_type(
2447 private_data, struct ctdb_recoverd);
2449 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2451 ctdb_load_nodes_file(rec->ctdb);
/* Timer callback for the deferred-rebalance timeout: if rebalance target
 * nodes are still pending, request another takeover run. */
2455 static void ctdb_rebalance_timeout(struct tevent_context *ev,
2456 struct tevent_timer *te,
2457 struct timeval t, void *p)
2459 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
/* List may have been cleared by a successful takeover run already. */
2461 if (rec->force_rebalance_nodes == NULL) {
2463 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2468 ("Rebalance timeout occurred - trigger takeover run\n"));
2469 rec->need_takeover_run = true;
/* SRVID handler: a node (payload = uint32_t pnn) wants IPs rebalanced to
 * it.  Appends the pnn to rec->force_rebalance_nodes and, if configured,
 * arms a deferred takeover-run timer.  Only acted on by the recmaster. */
2473 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
2476 struct ctdb_recoverd *rec = talloc_get_type(
2477 private_data, struct ctdb_recoverd);
2478 struct ctdb_context *ctdb = rec->ctdb;
2482 uint32_t deferred_rebalance;
/* Ignore unless we are the recovery master. */
2484 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2488 if (data.dsize != sizeof(uint32_t)) {
2489 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2493 pnn = *(uint32_t *)&data.dptr[0];
2495 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2497 /* Copy any existing list of nodes. There's probably some
2498 * sort of realloc variant that will do this but we need to
2499 * make sure that freeing the old array also cancels the timer
2500 * event for the timeout... not sure if realloc will do that.
2502 len = (rec->force_rebalance_nodes != NULL) ?
2503 talloc_array_length(rec->force_rebalance_nodes) :
2506 /* This allows duplicates to be added but they don't cause
2507 * harm. A call to add a duplicate PNN arguably means that
2508 * the timeout should be reset, so this is the simplest
/* New array of len+1; old entries copied in, new pnn appended (the
 * append line is elided in this view). */
2511 t = talloc_zero_array(rec, uint32_t, len+1);
2512 CTDB_NO_MEMORY_VOID(ctdb, t);
2514 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
/* Freeing the old array also destroys any timer parented to it. */
2518 talloc_free(rec->force_rebalance_nodes);
2520 rec->force_rebalance_nodes = t;
2522 /* If configured, setup a deferred takeover run to make sure
2523 * that certain nodes get IPs rebalanced to them. This will
2524 * be cancelled if a successful takeover run happens before
2525 * the timeout. Assign tunable value to variable for
2528 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2529 if (deferred_rebalance != 0) {
/* Timer is parented to the node list so freeing the list cancels it. */
2530 tevent_add_timer(ctdb->ev, rec->force_rebalance_nodes,
2531 timeval_current_ofs(deferred_rebalance, 0),
2532 ctdb_rebalance_timeout, rec);
/* SRVID handler: record a public-IP assignment update in the recmaster's
 * IP assignment tree; ignored on non-recmaster nodes. */
2538 static void recd_update_ip_handler(uint64_t srvid, TDB_DATA data,
2541 struct ctdb_recoverd *rec = talloc_get_type(
2542 private_data, struct ctdb_recoverd);
2543 struct ctdb_public_ip *ip;
2545 if (rec->recmaster != rec->ctdb->pnn) {
2546 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2550 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2551 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2555 ip = (struct ctdb_public_ip *)data.dptr;
2557 update_ip_assignment_tree(rec->ctdb, ip);
/* Common helper for "disable X" SRVID requests: validate the disable
 * message, disable the given operation for the requested timeout, and
 * reply to the sender with our PNN on success (negative on failure —
 * the failure assignment is elided in this view). */
2560 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2562 struct ctdb_op_state *op_state)
2564 struct ctdb_disable_message *r;
2569 /* Validate input data */
2570 if (data.dsize != sizeof(struct ctdb_disable_message)) {
2571 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2572 "expecting %lu\n", (long unsigned)data.dsize,
2573 (long unsigned)sizeof(struct ctdb_srvid_message)));
2576 if (data.dptr == NULL) {
2577 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2581 r = (struct ctdb_disable_message *)data.dptr;
2582 timeout = r->timeout;
2584 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2589 /* Returning our PNN tells the caller that we succeeded */
2590 ret = ctdb_get_pnn(ctdb);
2592 result.dsize = sizeof(int32_t);
2593 result.dptr = (uint8_t *)&ret;
/* The disable message begins with a srvid_message header, so the cast
 * to the reply type is safe. */
2594 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
/* SRVID handler: disable takeover runs for the requested timeout and
 * reply to the sender (see srvid_disable_and_reply). */
2597 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
2600 struct ctdb_recoverd *rec = talloc_get_type(
2601 private_data, struct ctdb_recoverd);
2603 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
2606 /* Backward compatibility for this SRVID */
/* Legacy variant of disable_takeover_runs_handler: payload is a bare
 * uint32_t timeout and no reply is sent. */
2607 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
2610 struct ctdb_recoverd *rec = talloc_get_type(
2611 private_data, struct ctdb_recoverd);
2614 if (data.dsize != sizeof(uint32_t)) {
2615 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2616 "expecting %lu\n", (long unsigned)data.dsize,
2617 (long unsigned)sizeof(uint32_t)));
2620 if (data.dptr == NULL) {
2621 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2625 timeout = *((uint32_t *)data.dptr);
2627 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
/*
 * SRVID handler: temporarily disable recoveries for the requested
 * timeout.  Same contract as disable_takeover_runs_handler() but acts
 * on rec->recovery.
 */
2630 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
2633 	struct ctdb_recoverd *rec = talloc_get_type(
2634 		private_data, struct ctdb_recoverd);
2636 	srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
2640 	handler for ip reallocate, just add it to the list of requests and
2641 	handle this later in the monitor_cluster loop so we do not recurse
2642 	with other requests to takeover_run()
2644 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
2647 	struct ctdb_srvid_message *request;
2648 	struct ctdb_recoverd *rec = talloc_get_type(
2649 		private_data, struct ctdb_recoverd);
/* Validate the payload: the whole message is the sender's return address */
2651 	if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2652 		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2656 	request = (struct ctdb_srvid_message *)data.dptr;
/* Queue the request; process_ipreallocate_requests() replies later */
2658 	srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
2661 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2662 struct ctdb_recoverd *rec)
2666 struct srvid_requests *current;
2668 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2670 /* Only process requests that are currently pending. More
2671 * might come in while the takeover run is in progress and
2672 * they will need to be processed later since they might
2673 * be in response flag changes.
2675 current = rec->reallocate_requests;
2676 rec->reallocate_requests = NULL;
2678 if (do_takeover_run(rec, rec->nodemap, false)) {
2679 ret = ctdb_get_pnn(ctdb);
2684 result.dsize = sizeof(int32_t);
2685 result.dptr = (uint8_t *)&ret;
2687 srvid_requests_reply(ctdb, ¤t, result);
2692 	handler for recovery master elections
/*
 * SRVID handler for election messages.  Ignores our own packets, resets
 * the election timeout, and either (a) contests the election by
 * scheduling a new election broadcast if we believe we should win, or
 * (b) concedes: drops the recovery lock, clears local state and records
 * the sender as recovery master.
 * NOTE(review): else-branches and closing braces are elided in this
 * extraction; control-flow comments below are best-effort.
 */
2694 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2696 	struct ctdb_recoverd *rec = talloc_get_type(
2697 		private_data, struct ctdb_recoverd);
2698 	struct ctdb_context *ctdb = rec->ctdb;
2700 	struct election_message *em = (struct election_message *)data.dptr;
2702 	/* Ignore election packets from ourself */
2703 	if (ctdb->pnn == em->pnn) {
2707 	/* we got an election packet - update the timeout for the election */
2708 	talloc_free(rec->election_timeout);
2709 	rec->election_timeout = tevent_add_timer(
2712 		timeval_current_ofs(0, 500000) :
2713 		timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2714 		ctdb_election_timeout, rec);
2716 	/* someone called an election. check their election data
2717 	   and if we disagree and we would rather be the elected node,
2718 	   send a new election message to all other nodes
2720 	if (ctdb_election_win(rec, em)) {
/* Contest: schedule our own election request (deduped via send_election_te) */
2721 		if (!rec->send_election_te) {
2722 			rec->send_election_te = tevent_add_timer(
2724 				timeval_current_ofs(0, 500000),
2725 				election_send_request, rec);
2727 		/*unban_all_nodes(ctdb);*/
/* Concede: cancel any pending contest and release recovery-master state */
2732 	TALLOC_FREE(rec->send_election_te);
2734 	/* Release the recovery lock file */
2735 	if (ctdb_recovery_have_lock(ctdb)) {
2736 		ctdb_recovery_unlock(ctdb);
2737 		unban_all_nodes(ctdb);
2740 	clear_ip_assignment_tree(ctdb);
2742 	/* ok, let that guy become recmaster then */
2743 	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2744 		CTDB_CURRENT_NODE, em->pnn);
2746 		DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
2749 	rec->recmaster = em->pnn;
2756 	force the start of the election process
/*
 * Start a recovery-master election: put the whole cluster into ACTIVE
 * recovery mode (stopping internode traffic), arm the election timeout,
 * broadcast our election request and then block until the election
 * window closes (ctdb_wait_election).
 */
2758 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2759 		struct ctdb_node_map_old *nodemap)
2762 	struct ctdb_context *ctdb = rec->ctdb;
2764 	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2766 	/* set all nodes to recovery mode to stop all internode traffic */
2767 	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, false);
2769 		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
/* (Re)arm the election timeout; an existing timer is discarded first */
2773 	talloc_free(rec->election_timeout);
2774 	rec->election_timeout = tevent_add_timer(
2777 		timeval_current_ofs(0, 500000) :
2778 		timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2779 		ctdb_election_timeout, rec);
2781 	ret = send_election_request(rec, pnn);
2783 		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2787 	/* wait for a few seconds to collect all responses */
2788 	ctdb_wait_election(rec);
2794 	handler for when a node changes its flags
/*
 * SRVID handler for node flag-change notifications.  Validates the
 * payload, refreshes the local nodemap, records the new flags for the
 * affected node and, when we are recovery master in NORMAL mode,
 * schedules a takeover run if the DISABLED bits changed (those cause IP
 * failover without triggering a recovery).
 */
2796 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2798 	struct ctdb_recoverd *rec = talloc_get_type(
2799 		private_data, struct ctdb_recoverd);
2800 	struct ctdb_context *ctdb = rec->ctdb;
2802 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2803 	struct ctdb_node_map_old *nodemap=NULL;
2804 	TALLOC_CTX *tmp_ctx;
2806 	int disabled_flag_changed;
/* Validate payload size before dereferencing c */
2808 	if (data.dsize != sizeof(*c)) {
2809 		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2813 	tmp_ctx = talloc_new(ctdb);
2814 	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2816 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2818 		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2819 		talloc_free(tmp_ctx);
/* Locate the node the notification refers to */
2824 	for (i=0;i<nodemap->num;i++) {
2825 		if (nodemap->nodes[i].pnn == c->pnn) break;
2828 	if (i == nodemap->num) {
2829 		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2830 		talloc_free(tmp_ctx);
2834 	if (c->old_flags != c->new_flags) {
2835 		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
/* Did the DISABLED bits flip relative to our current view? */
2838 	disabled_flag_changed =  (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2840 	nodemap->nodes[i].flags = c->new_flags;
2842 	ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2843 		CTDB_CURRENT_NODE, &ctdb->recovery_mode);
/* Only the recovery master in NORMAL mode reacts with a takeover run */
2846 		rec->recmaster == ctdb->pnn &&
2847 		ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2848 		/* Only do the takeover run if the perm disabled or unhealthy
2849 		   flags changed since these will cause an ip failover but not
2851 		   If the node became disconnected or banned this will also
2852 		   lead to an ip address failover but that is handled
2855 		if (disabled_flag_changed) {
2856 			rec->need_takeover_run = true;
2860 	talloc_free(tmp_ctx);
2864 	handler for when we need to push out flag changes ot all other nodes
/*
 * SRVID handler: fetch the authoritative node flags from the recovery
 * master and broadcast them to every connected node via the
 * MODIFY_FLAGS control.  The message payload identifies which node's
 * flags changed (c->pnn) and is bounds-checked against the nodemap.
 */
2866 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2869 	struct ctdb_recoverd *rec = talloc_get_type(
2870 		private_data, struct ctdb_recoverd);
2871 	struct ctdb_context *ctdb = rec->ctdb;
2873 	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2874 	struct ctdb_node_map_old *nodemap=NULL;
2875 	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2878 	/* read the node flags from the recmaster */
2879 	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2882 		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2883 		talloc_free(tmp_ctx);
/* Guard against indexing past the recmaster's nodemap */
2886 	if (c->pnn >= nodemap->num) {
2887 		DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2888 		talloc_free(tmp_ctx);
2892 	/* send the flags update to all connected nodes */
2893 	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2895 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2896 		nodes, 0, CONTROL_TIMEOUT(),
2900 		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2902 		talloc_free(tmp_ctx);
2906 	talloc_free(tmp_ctx);
/*
 * State shared between verify_recmode() and its per-node async
 * callback.  'status' aggregates the worst result seen; an
 * outstanding-reply counter ('count', on an elided line) is decremented
 * by the callback and polled by verify_recmode().
 */
2910 struct verify_recmode_normal_data {
2912 	enum monitor_result status;
/*
 * Async completion callback for one GET_RECMODE control sent by
 * verify_recmode().  Downgrades the aggregated status to MONITOR_FAILED
 * on a control failure, or to MONITOR_RECOVERY_NEEDED if the remote
 * node reports anything other than CTDB_RECOVERY_NORMAL.
 */
2915 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2917 	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2920 	/* one more node has responded with recmode data*/
2923 	/* if we failed to get the recmode, then return an error and let
2924 	   the main loop try again.
2926 	if (state->state != CTDB_CONTROL_DONE) {
/* Don't overwrite a more specific status already recorded */
2927 		if (rmdata->status == MONITOR_OK) {
2928 			rmdata->status = MONITOR_FAILED;
2933 	/* if we got a response, then the recmode will be stored in the
2936 	if (state->status != CTDB_RECOVERY_NORMAL) {
2937 		DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2938 		rmdata->status = MONITOR_RECOVERY_NEEDED;
2945 /* verify that all nodes are in normal recovery mode */
/*
 * Fan out an async GET_RECMODE control to every active node, then spin
 * the event loop until all replies arrive (or time out) and return the
 * aggregated monitor result.  A send failure aborts immediately with
 * MONITOR_FAILED so the main loop retries next iteration.
 */
2946 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2948 	struct verify_recmode_normal_data *rmdata;
2949 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2950 	struct ctdb_client_control_state *state;
2951 	enum monitor_result status;
2954 	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2955 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2957 	rmdata->status = MONITOR_OK;
2959 	/* loop over all active nodes and send an async getrecmode call to
2961 	for (j=0; j<nodemap->num; j++) {
/* Skip inactive (banned/stopped/disconnected) nodes */
2962 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2965 		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2967 			nodemap->nodes[j].pnn);
2968 		if (state == NULL) {
2969 			/* we failed to send the control, treat this as
2970 			   an error and try again next iteration
2972 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2973 			talloc_free(mem_ctx);
2974 			return MONITOR_FAILED;
2977 		/* set up the callback functions */
2978 		state->async.fn = verify_recmode_normal_callback;
2979 		state->async.private_data = rmdata;
2981 		/* one more control to wait for to complete */
2986 	/* now wait for up to the maximum number of seconds allowed
2987 	   or until all nodes we expect a response from has replied
2989 	while (rmdata->count > 0) {
2990 		tevent_loop_once(ctdb->ev);
/* Copy the result out before freeing the context that owns rmdata */
2993 	status = rmdata->status;
2994 	talloc_free(mem_ctx);
/*
 * State shared between verify_recmaster() and its per-node async
 * callback.  'rec' lets the callback mark culprits, 'status' aggregates
 * the result; 'pnn' and the outstanding-reply counter are on elided
 * lines (the callback compares state->status against rmdata->pnn).
 */
2999 struct verify_recmaster_data {
3000 	struct ctdb_recoverd *rec;
3003 	enum monitor_result status;
/*
 * Async completion callback for one GET_RECMASTER control sent by
 * verify_recmaster().  A control failure downgrades the aggregated
 * status to MONITOR_FAILED; a node that names a different recmaster is
 * marked as a culprit and triggers MONITOR_ELECTION_NEEDED.
 */
3006 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3008 	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3011 	/* one more node has responded with recmaster data*/
3014 	/* if we failed to get the recmaster, then return an error and let
3015 	   the main loop try again.
3017 	if (state->state != CTDB_CONTROL_DONE) {
/* Don't overwrite a more specific status already recorded */
3018 		if (rmdata->status == MONITOR_OK) {
3019 			rmdata->status = MONITOR_FAILED;
3024 	/* if we got a response, then the recmaster will be stored in the
3027 	if (state->status != rmdata->pnn) {
3028 		DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3029 		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3030 		rmdata->status = MONITOR_ELECTION_NEEDED;
3037 /* verify that all nodes agree that we are the recmaster */
/*
 * Fan out an async GET_RECMASTER control to every active node (except
 * the recmaster itself), spin the event loop until all replies arrive,
 * and return the aggregated monitor result.  Mirrors verify_recmode().
 */
3038 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
3040 	struct ctdb_context *ctdb = rec->ctdb;
3041 	struct verify_recmaster_data *rmdata;
3042 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3043 	struct ctdb_client_control_state *state;
3044 	enum monitor_result status;
3047 	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3048 	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3052 	rmdata->status = MONITOR_OK;
3054 	/* loop over all active nodes and send an async getrecmaster call to
3056 	for (j=0; j<nodemap->num; j++) {
/* The recmaster itself and inactive nodes are not queried */
3057 		if (nodemap->nodes[j].pnn == rec->recmaster) {
3060 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3063 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3065 			nodemap->nodes[j].pnn);
3066 		if (state == NULL) {
3067 			/* we failed to send the control, treat this as
3068 			   an error and try again next iteration
3070 			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3071 			talloc_free(mem_ctx);
3072 			return MONITOR_FAILED;
3075 		/* set up the callback functions */
3076 		state->async.fn = verify_recmaster_callback;
3077 		state->async.private_data = rmdata;
3079 		/* one more control to wait for to complete */
3084 	/* now wait for up to the maximum number of seconds allowed
3085 	   or until all nodes we expect a response from has replied
3087 	while (rmdata->count > 0) {
3088 		tevent_loop_once(ctdb->ev);
/* Copy the result out before freeing the context that owns rmdata */
3091 	status = rmdata->status;
3092 	talloc_free(mem_ctx);
/*
 * Compare the node's current interface list against the cached copy on
 * 'rec' and report whether anything changed (count, names in slot
 * order, or link state).  The fresh list replaces the cached one before
 * returning.  If the interfaces cannot be read at all, this errs on the
 * side of "changed" so a takeover run is triggered.
 */
3096 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3097 		struct ctdb_recoverd *rec)
3099 	struct ctdb_iface_list_old *ifaces = NULL;
3100 	TALLOC_CTX *mem_ctx;
3103 	mem_ctx = talloc_new(NULL);
3105 	/* Read the interfaces from the local node */
3106 	if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3107 		CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3108 		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3109 		/* We could return an error.  However, this will be
3110 		 * rare so we'll decide that the interfaces have
3111 		 * actually changed, just in case.
3113 		talloc_free(mem_ctx);
3118 		/* We haven't been here before so things have changed */
3119 		DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3121 	} else if (rec->ifaces->num != ifaces->num) {
3122 		/* Number of interfaces has changed */
3123 		DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3124 			rec->ifaces->num, ifaces->num));
3127 		/* See if interface names or link states have changed */
/* Same-slot comparison: relies on the interface list having a stable order */
3129 		for (i = 0; i < rec->ifaces->num; i++) {
3130 			struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
3131 			if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3133 				("Interface in slot %d changed: %s => %s\n",
3134 				i, iface->name, ifaces->ifaces[i].name));
3138 			if (iface->link_state != ifaces->ifaces[i].link_state) {
3140 				("Interface %s changed state: %d => %d\n",
3141 				iface->name, iface->link_state,
3142 				ifaces->ifaces[i].link_state));
/* Cache the fresh list on rec for the next comparison */
3149 	talloc_free(rec->ifaces);
3150 	rec->ifaces = talloc_steal(rec, ifaces);
3152 	talloc_free(mem_ctx);
3156 /* called to check that the local allocation of public ip addresses is ok.
/*
 * Checks performed (unless IP failover is disabled by tunable):
 *  - an interface change forces a takeover run;
 *  - an unassigned available IP we could serve forces a takeover run;
 *  - an IP assigned to us but missing from an interface forces one;
 *  - an IP we serve but should not is released immediately.
 * When a takeover run is needed a CTDB_SRVID_TAKEOVER_RUN message is
 * sent to the recovery master.  Returns 0 on success, -1 on failure to
 * read the IP lists.
 */
3158 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map_old *nodemap)
3160 	TALLOC_CTX *mem_ctx = talloc_new(NULL);
3162 	bool need_takeover_run = false;
3164 	if (interfaces_have_changed(ctdb, rec)) {
3165 		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3166 			"local node %u - force takeover run\n",
3168 		need_takeover_run = true;
3171 	/* verify that we have the ip addresses we should have
3172 	   and we don't have ones we shouldnt have.
3173 	   if we find an inconsistency we set recmode to
3174 	   active on the local node and wait for the recmaster
3175 	   to do a full blown recovery.
3176 	   also if the pnn is -1 and we are healthy and can host the ip
3177 	   we also request a ip reallocation.
3179 	if (ctdb->tunable.disable_ip_failover == 0) {
3180 		struct ctdb_public_ip_list_old *ips = NULL;
3182 		/* read the *available* IPs from the local node */
3183 		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3185 			DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3186 			talloc_free(mem_ctx);
/* flags == 0 means fully healthy, so we are eligible to host the IP */
3190 		for (j=0; j<ips->num; j++) {
3191 			if (ips->ips[j].pnn == -1 &&
3192 				nodemap->nodes[pnn].flags == 0) {
3193 				DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3194 					ctdb_addr_to_str(&ips->ips[j].addr)));
3195 				need_takeover_run = true;
3201 		/* read the *known* IPs from the local node */
3202 		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3204 			DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3205 			talloc_free(mem_ctx);
3209 		for (j=0; j<ips->num; j++) {
3210 			if (ips->ips[j].pnn == pnn) {
/* Assigned to us: make sure it is actually configured on an interface */
3211 				if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3212 					DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3213 						ctdb_addr_to_str(&ips->ips[j].addr)));
3214 					need_takeover_run = true;
/* Not assigned to us: release it locally if we still hold it */
3217 				if (ctdb->do_checkpublicip &&
3218 					ctdb_sys_have_ip(&ips->ips[j].addr)) {
3220 					DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3221 						ctdb_addr_to_str(&ips->ips[j].addr)));
3223 					if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3224 						DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3231 	if (need_takeover_run) {
3232 		struct ctdb_srvid_message rd;
3235 		DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3239 		data.dptr = (uint8_t *)&rd;
3240 		data.dsize = sizeof(rd);
/* Ask the recmaster to do the takeover run; failure is logged only */
3242 		ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3244 			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3247 	talloc_free(mem_ctx);
/*
 * Async callback for GET_NODEMAP: stash the nodemap returned by node
 * 'node_pnn' into the callers array, indexed by PNN, taking talloc
 * ownership of the reply buffer.  Out-of-range PNNs are rejected to
 * avoid writing past the array.
 */
3252 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3254 	struct ctdb_node_map_old **remote_nodemaps = callback_data;
3256 	if (node_pnn >= ctdb->num_nodes) {
3257 		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3261 	remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
 * Fetch the nodemap from every active node in parallel, filling
 * remote_nodemaps[] (indexed by PNN) via async_getnodemap_callback().
 * Returns non-zero if any node failed to reply.
 */
3265 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3266 	struct ctdb_node_map_old *nodemap,
3267 	struct ctdb_node_map_old **remote_nodemaps)
3271 	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3272 	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3274 		CONTROL_TIMEOUT(), false, tdb_null,
3275 		async_getnodemap_callback,
3277 		remote_nodemaps) != 0) {
3278 		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
/*
 * Synchronise our cached recovery lock file path with the one the main
 * daemon reports.  Handles four cases: lock file disabled, newly
 * enabled, unchanged, and changed - dropping any currently held lock
 * whenever the path is disabled or replaced.  Returns 0 on success.
 */
3286 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3288 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3289 	const char *reclockfile;
3291 	if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3292 		DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3293 		talloc_free(tmp_ctx);
/* Daemon reports no reclock: drop our cached path and any held lock */
3297 	if (reclockfile == NULL) {
3298 		if (ctdb->recovery_lock_file != NULL) {
3299 			DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3300 			talloc_free(ctdb->recovery_lock_file);
3301 			ctdb->recovery_lock_file = NULL;
3302 			ctdb_recovery_unlock(ctdb);
3304 		talloc_free(tmp_ctx);
/* Newly enabled: cache the path; the lock is (re)taken elsewhere */
3308 	if (ctdb->recovery_lock_file == NULL) {
3310 			("Recovery lock file enabled (%s)\n", reclockfile));
3311 		ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3312 		ctdb_recovery_unlock(ctdb);
3313 		talloc_free(tmp_ctx);
/* Unchanged: nothing to do */
3318 	if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3319 		talloc_free(tmp_ctx);
/* Changed: swap in the new path and drop the lock held on the old one */
3324 		("Recovery lock file changed (now %s)\n", reclockfile));
3325 	talloc_free(ctdb->recovery_lock_file);
3326 	ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3327 	ctdb_recovery_unlock(ctdb);
3329 	talloc_free(tmp_ctx);
3333 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3334 TALLOC_CTX *mem_ctx)
3337 struct ctdb_node_map_old *nodemap=NULL;
3338 struct ctdb_node_map_old *recmaster_nodemap=NULL;
3339 struct ctdb_node_map_old **remote_nodemaps=NULL;
3340 struct ctdb_vnn_map *vnnmap=NULL;
3341 struct ctdb_vnn_map *remote_vnnmap=NULL;
3342 uint32_t num_lmasters;
3343 int32_t debug_level;
3348 /* verify that the main daemon is still running */
3349 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3350 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3354 /* ping the local daemon to tell it we are alive */
3355 ctdb_ctrl_recd_ping(ctdb);
3357 if (rec->election_timeout) {
3358 /* an election is in progress */
3362 /* read the debug level from the parent and update locally */
3363 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3365 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3368 DEBUGLEVEL = debug_level;
3370 /* get relevant tunables */
3371 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3373 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3378 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3379 CTDB_CURRENT_NODE, &ctdb->runstate);
3381 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3385 /* get the current recovery lock file from the server */
3386 if (update_recovery_lock_file(ctdb) != 0) {
3387 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3391 pnn = ctdb_get_pnn(ctdb);
3394 TALLOC_FREE(rec->nodemap);
3395 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3397 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3400 nodemap = rec->nodemap;
3402 /* remember our own node flags */
3403 rec->node_flags = nodemap->nodes[pnn].flags;
3405 ban_misbehaving_nodes(rec, &self_ban);
3407 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3411 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3412 also frozen and that the recmode is set to active.
3414 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3415 /* If this node has become inactive then we want to
3416 * reduce the chances of it taking over the recovery
3417 * master role when it becomes active again. This
3418 * helps to stabilise the recovery master role so that
3419 * it stays on the most stable node.
3421 rec->priority_time = timeval_current();
3423 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3425 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3427 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3428 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3430 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3432 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3436 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3438 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3443 /* If this node is stopped or banned then it is not the recovery
3444 * master, so don't do anything. This prevents stopped or banned
3445 * node from starting election and sending unnecessary controls.
3450 /* If we are not the recmaster then do some housekeeping */
3451 if (rec->recmaster != pnn) {
3452 /* Ignore any IP reallocate requests - only recmaster
3455 TALLOC_FREE(rec->reallocate_requests);
3456 /* Clear any nodes that should be force rebalanced in
3457 * the next takeover run. If the recovery master role
3458 * has moved then we don't want to process these some
3459 * time in the future.
3461 TALLOC_FREE(rec->force_rebalance_nodes);
3464 /* Retrieve capabilities from all connected nodes */
3465 ret = update_capabilities(rec, nodemap);
3467 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3471 /* When recovery daemon is started, recmaster is set to
3472 * "unknown" so it knows to start an election.
3474 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
3475 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3476 force_election(rec, pnn, nodemap);
3481 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3482 * but we have, then force an election and try to become the new
3485 if (!ctdb_node_has_capabilities(rec->caps,
3487 CTDB_CAP_RECMASTER) &&
3488 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3489 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3490 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3491 " but we (node %u) have - force an election\n",
3492 rec->recmaster, pnn));
3493 force_election(rec, pnn, nodemap);
3497 /* Verify that the master node has not been deleted. This
3498 * should not happen because a node should always be shutdown
3499 * before being deleted, causing a new master to be elected
3500 * before now. However, if something strange has happened
3501 * then checking here will ensure we don't index beyond the
3502 * end of the nodemap array. */
3503 if (rec->recmaster >= nodemap->num) {
3505 ("Recmaster node %u has been deleted. Force election\n",
3507 force_election(rec, pnn, nodemap);
3511 /* if recovery master is disconnected/deleted we must elect a new recmaster */
3512 if (nodemap->nodes[rec->recmaster].flags &
3513 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
3515 ("Recmaster node %u is disconnected/deleted. Force election\n",
3517 force_election(rec, pnn, nodemap);
3521 /* get nodemap from the recovery master to check if it is inactive */
3522 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
3523 mem_ctx, &recmaster_nodemap);
3527 " Unable to get nodemap from recovery master %u\n",
3533 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
3534 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3536 ("Recmaster node %u is inactive. Force election\n",
3539 * update our nodemap to carry the recmaster's notion of
3540 * its own flags, so that we don't keep freezing the
3541 * inactive recmaster node...
3543 nodemap->nodes[rec->recmaster].flags =
3544 recmaster_nodemap->nodes[rec->recmaster].flags;
3545 force_election(rec, pnn, nodemap);
3549 /* verify that we have all ip addresses we should have and we dont
3550 * have addresses we shouldnt have.
3552 if (ctdb->tunable.disable_ip_failover == 0 &&
3553 !ctdb_op_is_disabled(rec->takeover_run)) {
3554 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3555 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3560 /* if we are not the recmaster then we do not need to check
3561 if recovery is needed
3563 if (pnn != rec->recmaster) {
3568 /* ensure our local copies of flags are right */
3569 ret = update_local_flags(rec, nodemap);
3570 if (ret == MONITOR_ELECTION_NEEDED) {
3571 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3572 force_election(rec, pnn, nodemap);
3575 if (ret != MONITOR_OK) {
3576 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3580 if (ctdb->num_nodes != nodemap->num) {
3581 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3582 ctdb_load_nodes_file(ctdb);
3586 /* verify that all active nodes agree that we are the recmaster */
3587 switch (verify_recmaster(rec, nodemap, pnn)) {
3588 case MONITOR_RECOVERY_NEEDED:
3589 /* can not happen */
3591 case MONITOR_ELECTION_NEEDED:
3592 force_election(rec, pnn, nodemap);
3596 case MONITOR_FAILED:
3601 /* get the vnnmap */
3602 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3604 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3608 if (rec->need_recovery) {
3609 /* a previous recovery didn't finish */
3610 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3614 /* verify that all active nodes are in normal mode
3615 and not in recovery mode
3617 switch (verify_recmode(ctdb, nodemap)) {
3618 case MONITOR_RECOVERY_NEEDED:
3619 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3621 case MONITOR_FAILED:
3623 case MONITOR_ELECTION_NEEDED:
3624 /* can not happen */
3630 if (ctdb->recovery_lock_file != NULL) {
3631 /* We must already hold the recovery lock */
3632 if (!ctdb_recovery_have_lock(ctdb)) {
3633 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3634 ctdb_set_culprit(rec, ctdb->pnn);
3635 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3641 /* if there are takeovers requested, perform it and notify the waiters */
3642 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3643 rec->reallocate_requests) {
3644 process_ipreallocate_requests(ctdb, rec);
3647 /* If recoveries are disabled then there is no use doing any
3648 * nodemap or flags checks. Recoveries might be disabled due
3649 * to "reloadnodes", so doing these checks might cause an
3650 * unnecessary recovery. */
3651 if (ctdb_op_is_disabled(rec->recovery)) {
3655 /* get the nodemap for all active remote nodes
3657 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
3658 if (remote_nodemaps == NULL) {
3659 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3662 for(i=0; i<nodemap->num; i++) {
3663 remote_nodemaps[i] = NULL;
3665 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3666 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3670 /* verify that all other nodes have the same nodemap as we have
3672 for (j=0; j<nodemap->num; j++) {
3673 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3677 if (remote_nodemaps[j] == NULL) {
3678 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3679 ctdb_set_culprit(rec, j);
3684 /* if the nodes disagree on how many nodes there are
3685 then this is a good reason to try recovery
3687 if (remote_nodemaps[j]->num != nodemap->num) {
3688 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3689 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3690 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3691 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3695 /* if the nodes disagree on which nodes exist and are
3696 active, then that is also a good reason to do recovery
3698 for (i=0;i<nodemap->num;i++) {
3699 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3700 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3701 nodemap->nodes[j].pnn, i,
3702 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3703 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3704 do_recovery(rec, mem_ctx, pnn, nodemap,
3712 * Update node flags obtained from each active node. This ensure we have
3713 * up-to-date information for all the nodes.
3715 for (j=0; j<nodemap->num; j++) {
3716 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3719 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3722 for (j=0; j<nodemap->num; j++) {
3723 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3727 /* verify the flags are consistent
3729 for (i=0; i<nodemap->num; i++) {
3730 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3734 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3735 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3736 nodemap->nodes[j].pnn,
3737 nodemap->nodes[i].pnn,
3738 remote_nodemaps[j]->nodes[i].flags,
3739 nodemap->nodes[i].flags));
3741 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3742 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3743 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3744 do_recovery(rec, mem_ctx, pnn, nodemap,
3748 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3749 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3750 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3751 do_recovery(rec, mem_ctx, pnn, nodemap,
3760 /* count how many active nodes there are */
3762 for (i=0; i<nodemap->num; i++) {
3763 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3764 if (ctdb_node_has_capabilities(rec->caps,
3765 ctdb->nodes[i]->pnn,
3766 CTDB_CAP_LMASTER)) {
3773 /* There must be the same number of lmasters in the vnn map as
3774 * there are active nodes with the lmaster capability... or
3777 if (vnnmap->size != num_lmasters) {
3778 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3779 vnnmap->size, num_lmasters));
3780 ctdb_set_culprit(rec, ctdb->pnn);
3781 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3785 /* verify that all active nodes in the nodemap also exist in
3788 for (j=0; j<nodemap->num; j++) {
3789 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3792 if (nodemap->nodes[j].pnn == pnn) {
3796 for (i=0; i<vnnmap->size; i++) {
3797 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3801 if (i == vnnmap->size) {
3802 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3803 nodemap->nodes[j].pnn));
3804 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3805 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3811 /* verify that all other nodes have the same vnnmap
3812 and are from the same generation
3814 for (j=0; j<nodemap->num; j++) {
3815 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3818 if (nodemap->nodes[j].pnn == pnn) {
3822 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3823 mem_ctx, &remote_vnnmap);
3825 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3826 nodemap->nodes[j].pnn));
3830 /* verify the vnnmap generation is the same */
3831 if (vnnmap->generation != remote_vnnmap->generation) {
3832 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3833 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3834 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3835 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3839 /* verify the vnnmap size is the same */
3840 if (vnnmap->size != remote_vnnmap->size) {
3841 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3842 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3843 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3844 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3848 /* verify the vnnmap is the same */
3849 for (i=0;i<vnnmap->size;i++) {
3850 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3851 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3852 nodemap->nodes[j].pnn));
3853 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3854 do_recovery(rec, mem_ctx, pnn, nodemap,
3861 /* we might need to change who has what IP assigned */
3862 if (rec->need_takeover_run) {
3863 /* If takeover run fails, then the offending nodes are
3864 * assigned ban culprit counts. And we re-try takeover.
3865 * If takeover run fails repeatedly, the node would get
3868 do_takeover_run(rec, nodemap, true);
3873 the main monitoring loop
/*
 * Entry point of the recovery daemon proper (never returns in normal
 * operation).  Allocates the long-lived recoverd state, registers all
 * SRVID message handlers with the main daemon, then repeatedly invokes
 * main_loop(), throttled to at most once per recover_interval seconds.
 * NOTE(review): this excerpt carries original line numbers and elides
 * braces/blank lines, so brace structure is not visible here.
 */
3875 static void monitor_cluster(struct ctdb_context *ctdb)
3877 struct ctdb_recoverd *rec;
3879 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
/* rec holds all recovery-daemon state; parented to ctdb so it lives for
   the lifetime of the daemon.  Allocation failure is fatal. */
3881 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3882 CTDB_NO_MEMORY_FATAL(ctdb, rec);
/* No recovery master is known yet at startup */
3885 rec->recmaster = CTDB_UNKNOWN_PNN;
/* Counters/state for the two operations that can be externally disabled */
3887 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3888 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3890 rec->recovery = ctdb_op_init(rec, "recoveries");
3891 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3893 rec->priority_time = timeval_current();
/* --- SRVID handler registrations: each maps a well-known service id to
   the callback that services requests sent from clients/other nodes --- */
3895 /* register a message port for sending memory dumps */
3896 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3898 /* register a message port for recovery elections */
3899 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3901 /* when nodes are disabled/enabled */
3902 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3904 /* when we are asked to push out a flag change */
3905 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3907 /* register a message port for vacuum fetch */
3908 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3910 /* register a message port for reloadnodes */
3911 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3913 /* register a message port for performing a takeover run */
3914 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3916 /* register a message port for disabling the ip check for a short while */
3917 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3919 /* register a message port for updating the recovery daemons node assignment for an ip */
3920 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3922 /* register a message port for forcing a rebalance of a node next
3924 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3926 /* Register a message port for disabling takeover runs */
3927 ctdb_client_set_message_handler(ctdb,
3928 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3929 disable_takeover_runs_handler, rec);
3931 /* Register a message port for disabling recoveries */
3932 ctdb_client_set_message_handler(ctdb,
3933 CTDB_SRVID_DISABLE_RECOVERIES,
3934 disable_recoveries_handler, rec);
3936 /* register a message port for detaching database */
3937 ctdb_client_set_message_handler(ctdb,
3938 CTDB_SRVID_DETACH_DATABASE,
3939 detach_database_handler, rec);
/* --- main loop: a fresh temporary talloc context per iteration so any
   allocations made by main_loop() are released each time around --- */
3942 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3943 struct timeval start;
/* allocation-failure path for the per-iteration context (handling lines
   elided in this excerpt) */
3947 DEBUG(DEBUG_CRIT,(__location__
3948 " Failed to create temp context\n"));
3952 start = timeval_current();
3953 main_loop(ctdb, rec, mem_ctx);
3954 talloc_free(mem_ctx);
3956 /* we only check for recovery once every second */
/* sleep off the remainder of recover_interval so iterations are paced;
   the continuation of this call ("- elapsed") is elided in this excerpt */
3957 elapsed = timeval_elapsed(&start);
3958 if (elapsed < ctdb->tunable.recover_interval) {
3959 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3966 event handler for when the main ctdbd dies
/*
 * Fired when the pipe shared with the parent ctdbd becomes readable,
 * which happens when the parent's end is closed, i.e. the main daemon
 * has exited.  The recovery daemon must not outlive its parent, so it
 * logs and exits (the exit call itself is elided in this excerpt).
 */
3968 static void ctdb_recoverd_parent(struct tevent_context *ev,
3969 struct tevent_fd *fde,
3970 uint16_t flags, void *private_data)
3972 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3977 called regularly to verify that the recovery daemon is still running
/*
 * Periodic liveness check run in the MAIN daemon: probe the recovery
 * daemon with signal 0 (existence check, no signal delivered); if it is
 * gone, schedule an immediate restart via ctdb_restart_recd().  Always
 * re-arms itself to run again in 30 seconds.
 */
3979 static void ctdb_check_recd(struct tevent_context *ev,
3980 struct tevent_timer *te,
3981 struct timeval yt, void *p)
3983 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
/* ctdb_kill with signal 0 only tests whether the pid still exists */
3985 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3986 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
/* timeval_zero(): restart as soon as the event loop runs */
3988 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3989 ctdb_restart_recd, ctdb);
/* re-arm; parented to recd_ctx so stopping recoverd also cancels this */
3994 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3995 timeval_current_ofs(30, 0),
3996 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap exited children with a
 * non-blocking waitpid().  ECHILD (no children left) is expected and not
 * logged as an error.  The surrounding loop/brace structure is elided in
 * this excerpt — presumably this reaps repeatedly until waitpid returns
 * <= 0; confirm against the full source.
 */
3999 static void recd_sig_child_handler(struct tevent_context *ev,
4000 struct tevent_signal *se, int signum,
4001 int count, void *dont_care,
4004 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
/* WNOHANG: never block inside a signal-driven event handler */
4009 pid = waitpid(-1, &status, WNOHANG);
4011 if (errno != ECHILD) {
4012 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4017 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4023 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon.  Returns in the PARENT (main ctdbd) after
 * recording the child pid and arming the 30s ctdb_check_recd liveness
 * timer.  The CHILD never returns from this function in normal operation:
 * it re-seeds random, switches from server to client mode, wires up
 * parent-death and SIGCHLD handling, then enters monitor_cluster().
 * Error-path returns are elided in this excerpt.
 */
4025 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4028 struct tevent_signal *se;
4029 struct tevent_fd *fde;
/* The pipe is the parent-death channel: the child watches fd[0] for
   readability, which signals that the parent closed its end (died) */
4031 if (pipe(fd) != 0) {
4035 ctdb->recoverd_pid = ctdb_fork(ctdb);
4036 if (ctdb->recoverd_pid == -1) {
/* ---- parent path ---- */
4040 if (ctdb->recoverd_pid != 0) {
/* fresh talloc context scoping everything tied to this recoverd
   instance, so ctdb_stop_recoverd can tear it all down at once */
4041 talloc_free(ctdb->recd_ctx);
4042 ctdb->recd_ctx = talloc_new(ctdb);
4043 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4046 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
4047 timeval_current_ofs(30, 0),
4048 ctdb_check_recd, ctdb);
/* ---- child path ---- */
/* re-seed so the child's random sequence differs from the parent's */
4054 srandom(getpid() ^ time(NULL));
/* NOTE(review): "ctdb_recovered" looks like a typo for "ctdb_recoverd",
   but this is a runtime string — confirm against upstream before changing */
4056 ctdb_set_process_name("ctdb_recovered");
4057 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4058 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4062 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* exit when the parent dies; auto-close releases fd[0] with the event */
4064 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
4065 ctdb_recoverd_parent, &fd[0]);
4066 tevent_fd_set_auto_close(fde);
4068 /* set up a handler to pick up sigchld */
4069 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
4070 recd_sig_child_handler, ctdb);
4072 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* does not return in normal operation */
4076 monitor_cluster(ctdb);
4078 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4083 shutdown the recovery daemon
/*
 * Terminate the recovery daemon child (SIGTERM) and release the talloc
 * contexts tied to it; freeing recd_ctx also cancels the liveness timer
 * parented to it.  A pid of 0 means no recoverd is running, so there is
 * nothing to stop (the early return is elided in this excerpt).
 */
4085 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4087 if (ctdb->recoverd_pid == 0) {
4091 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4092 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
/* TALLOC_FREE also NULLs the pointers, preventing stale reuse */
4094 TALLOC_FREE(ctdb->recd_ctx);
4095 TALLOC_FREE(ctdb->recd_ping_count);
/*
 * Timer callback (scheduled by ctdb_check_recd when the recovery daemon
 * has disappeared): restart recoverd by stopping any remnant and then
 * starting a fresh instance.
 */
4098 static void ctdb_restart_recd(struct tevent_context *ev,
4099 struct tevent_timer *te,
4100 struct timeval t, void *private_data)
4102 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4104 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4105 ctdb_stop_recoverd(ctdb);
4106 ctdb_start_recoverd(ctdb);