4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "system/filesys.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/wait.h"
28 #include "../include/ctdb.h"
29 #include "../include/ctdb_private.h"
31 #include "dlinklist.h"
/* NOTE(review): this extract elides many physical lines (braces, blank
   lines, some members); the leading numbers are original line numbers. */
/* One entry per "ctdb ipreallocate" client waiting for the current
   takeover run to finish; rd holds the reply address for the caller. */
34 /* list of "ctdb ipreallocate" processes to call back when we have
35 finished the takeover run.
37 struct ip_reallocate_list {
38 struct ip_reallocate_list *next;
39 struct rd_memdump_reply *rd;
/* Per-node record of recent misbehaviour used for banning decisions.
   last_reported_time is when credits were last added; a counter member
   exists in the full source but is elided from this extract. */
42 struct ctdb_banning_state {
44 struct timeval last_reported_time;
/* Private, in-memory state of the recovery daemon (one instance per
   recoverd process).  Holds the cached node map, election timers, the
   vacuuming work list and the "ctdb ipreallocate" caller queue. */
48 private state of recovery daemon
50 struct ctdb_recoverd {
51 struct ctdb_context *ctdb;
54 uint32_t num_connected;
/* pnn of the node most recently blamed for a recovery (see ctdb_set_culprit_count) */
55 uint32_t last_culprit_node;
56 struct ctdb_node_map *nodemap;
57 struct timeval priority_time;
58 bool need_takeover_run;
61 struct timed_event *send_election_te;
62 struct timed_event *election_timeout;
63 struct vacuum_info *vacuum_info;
64 TALLOC_CTX *ip_reallocate_ctx;
65 struct ip_reallocate_list *reallocate_callers;
66 TALLOC_CTX *ip_check_disable_ctx;
/* control/monitor timeouts derived from tunables; both macros expect a
   local variable named "ctdb" to be in scope at the expansion site */
69 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
70 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Ban node "pnn" for "ban_time" seconds by sending a SET_BAN control to
   that node.  Validates the pnn first; failures are logged only (the
   caller is not informed - best effort). */
74 ban a node for a period of time
76 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
79 struct ctdb_context *ctdb = rec->ctdb;
80 struct ctdb_ban_time bantime;
82 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
84 if (!ctdb_validate_pnn(ctdb, pnn)) {
85 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
/* NOTE(review): the line setting bantime.pnn is elided from this extract */
90 bantime.time = ban_time;
92 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
94 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
/* result codes produced by the periodic cluster monitoring pass */
100 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/* Broadcast CTDB_CONTROL_END_RECOVERY to all active nodes so each runs
   its "recovered" event scripts.  "caller" is only used to attribute
   failures in the log.  Returns 0 on success, -1 on failure. */
104 run the "recovered" eventscript on all nodes
106 static int run_recovered_eventscript(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, const char *caller)
111 tmp_ctx = talloc_new(ctdb);
112 CTDB_NO_MEMORY(ctdb, tmp_ctx);
114 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
115 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
117 CONTROL_TIMEOUT(), false, tdb_null,
120 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
122 talloc_free(tmp_ctx);
126 talloc_free(tmp_ctx);
131 remember the trouble maker
133 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
135 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
136 struct ctdb_banning_state *ban_state;
138 if (culprit > ctdb->num_nodes) {
139 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
143 if (ctdb->nodes[culprit]->ban_state == NULL) {
144 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
145 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
149 ban_state = ctdb->nodes[culprit]->ban_state;
150 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
151 /* this was the first time in a long while this node
152 misbehaved so we will forgive any old transgressions.
154 ban_state->count = 0;
157 ban_state->count += count;
158 ban_state->last_reported_time = timeval_current();
159 rec->last_culprit_node = culprit;
/* Convenience wrapper: blame "culprit" with a single credit. */
163 remember the trouble maker
165 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
167 ctdb_set_culprit_count(rec, culprit, 1);
/* Async per-node failure callback: a node failed its "startrecovery"
   event, so mark it as a recovery-failure culprit (one credit). */
171 /* this callback is called for every node that failed to execute the
174 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
176 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
178 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
180 ctdb_set_culprit(rec, node_pnn);
/* Broadcast CTDB_CONTROL_START_RECOVERY to all active nodes; nodes that
   fail are blamed via startrecovery_fail_callback.  Returns 0 on
   success, -1 if the broadcast failed. */
184 run the "startrecovery" eventscript on all nodes
186 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
190 struct ctdb_context *ctdb = rec->ctdb;
192 tmp_ctx = talloc_new(ctdb);
193 CTDB_NO_MEMORY(ctdb, tmp_ctx);
195 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
196 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
198 CONTROL_TIMEOUT(), false, tdb_null,
200 startrecovery_fail_callback,
202 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
203 talloc_free(tmp_ctx);
207 talloc_free(tmp_ctx);
211 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
213 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
214 DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
217 if (node_pnn < ctdb->num_nodes) {
218 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
/* Query CTDB_CONTROL_GET_CAPABILITIES from all active nodes; replies
   are cached per-node by async_getcap_callback.  Returns 0 on success,
   -1 if any node failed to answer. */
223 update the node capabilities for all connected nodes
225 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
230 tmp_ctx = talloc_new(ctdb);
231 CTDB_NO_MEMORY(ctdb, tmp_ctx);
233 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
234 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
238 async_getcap_callback, NULL,
240 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
241 talloc_free(tmp_ctx);
245 talloc_free(tmp_ctx);
/* Async per-node failure callback: a node failed to freeze during
   recovery; blame it heavily (nodemap->num credits) so repeat offenders
   get banned quickly. */
249 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
251 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
253 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
254 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Async per-node failure callback: a node failed to start the recovery
   transaction; blame it with nodemap->num credits. */
257 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
259 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
261 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
262 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
/* Set the recovery mode on all active nodes.  When entering
   CTDB_RECOVERY_ACTIVE the databases are first frozen, one FREEZE
   control per database priority level; freeze failures blame the node
   via set_recmode_fail_callback.  Returns 0 on success, -1 on failure. */
266 change recovery mode on all nodes
268 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
274 tmp_ctx = talloc_new(ctdb);
275 CTDB_NO_MEMORY(ctdb, tmp_ctx);
277 /* freeze all nodes */
278 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
279 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
/* priority levels are 1-based, hence i=1..NUM_DB_PRIORITIES inclusive */
282 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
283 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
288 set_recmode_fail_callback,
290 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
291 talloc_free(tmp_ctx);
298 data.dsize = sizeof(uint32_t);
299 data.dptr = (unsigned char *)&rec_mode;
301 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
308 talloc_free(tmp_ctx);
312 talloc_free(tmp_ctx);
/* Tell all active nodes that "pnn" is the recovery master by sending a
   SET_RECMASTER control.  Returns 0 on success, -1 on failure. */
317 change recovery master on all node
319 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
325 tmp_ctx = talloc_new(ctdb);
326 CTDB_NO_MEMORY(ctdb, tmp_ctx);
328 data.dsize = sizeof(uint32_t);
329 data.dptr = (unsigned char *)&pnn;
331 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
334 CONTROL_TIMEOUT(), false, data,
337 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
338 talloc_free(tmp_ctx);
342 talloc_free(tmp_ctx);
/* Push our local per-database priority settings to all active remote
   nodes.  Deliberately best-effort: failures are logged but never fail
   a recovery, because older remote nodes may not support the control. */
346 /* update all remote nodes to use the same db priority that we have
347 this can fail if the remote node has not yet been upgraded to
348 support this function, so we always return success and never fail
349 a recovery if this call fails.
351 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
352 struct ctdb_node_map *nodemap,
353 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
358 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
360 /* step through all local databases */
361 for (db=0; db<dbmap->num;db++) {
363 struct ctdb_db_priority db_prio;
366 db_prio.db_id = dbmap->dbs[db].dbid;
/* read the priority from the local node, then broadcast it */
367 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
369 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
373 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
375 data.dptr = (uint8_t *)&db_prio;
376 data.dsize = sizeof(db_prio);
378 if (ctdb_client_async_control(ctdb,
379 CTDB_CONTROL_SET_DB_PRIORITY,
381 CONTROL_TIMEOUT(), false, data,
384 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
/* Make sure every other active node has attached to every database we
   have locally: fetch each remote node's dbmap, and create (attach) any
   of our databases it is missing.  Returns 0 on success, -1 on error. */
392 ensure all other nodes have attached to any databases that we have
394 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
395 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
398 struct ctdb_dbid_map *remote_dbmap;
400 /* verify that all other nodes have all our databases */
401 for (j=0; j<nodemap->num; j++) {
402 /* we don't need to check ourselves */
403 if (nodemap->nodes[j].pnn == pnn) {
406 /* dont check nodes that are unavailable */
407 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
411 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
412 mem_ctx, &remote_dbmap);
414 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
418 /* step through all local databases */
419 for (db=0; db<dbmap->num;db++) {
/* linear scan of the remote dbmap for this dbid */
423 for (i=0;i<remote_dbmap->num;i++) {
424 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
428 /* the remote node already have this database */
429 if (i!=remote_dbmap->num) {
432 /* ok so we need to create this database */
433 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
436 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
439 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
440 mem_ctx, name, dbmap->dbs[db].persistent);
442 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
/* Mirror of create_missing_remote_databases: attach locally to every
   database that any other active node has but we don't, then re-read
   our own dbmap into *dbmap so it reflects the new attachments. */
453 ensure we are attached to any databases that anyone else is attached to
455 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
456 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
459 struct ctdb_dbid_map *remote_dbmap;
461 /* verify that we have all database any other node has */
462 for (j=0; j<nodemap->num; j++) {
463 /* we don't need to check ourselves */
464 if (nodemap->nodes[j].pnn == pnn) {
467 /* dont check nodes that are unavailable */
468 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
472 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
473 mem_ctx, &remote_dbmap);
475 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
479 /* step through all databases on the remote node */
480 for (db=0; db<remote_dbmap->num;db++) {
483 for (i=0;i<(*dbmap)->num;i++) {
484 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
488 /* we already have this db locally */
489 if (i!=(*dbmap)->num) {
492 /* ok so we need to create this database and
495 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
496 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
498 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
499 nodemap->nodes[j].pnn));
502 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
503 remote_dbmap->dbs[db].persistent);
505 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
/* refresh our dbmap so the caller sees the newly-attached databases */
508 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
510 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
/* Pull the full contents of database "dbid" from node "srcnode" into
   the temporary recovery tdb, merging record-by-record: an incoming
   record replaces an existing one only if its rsn is newer (or equal
   with a dmaster other than the recovery master).  Returns 0/-1. */
521 pull the remote database contents from one node into the recdb
523 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
524 struct tdb_wrap *recdb, uint32_t dbid)
528 struct ctdb_marshall_buffer *reply;
529 struct ctdb_rec_data *rec;
531 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
533 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
534 CONTROL_TIMEOUT(), &outdata);
536 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
537 talloc_free(tmp_ctx);
541 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
/* sanity-check the marshalled reply before walking it */
543 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
544 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
545 talloc_free(tmp_ctx);
549 rec = (struct ctdb_rec_data *)&reply->data[0];
/* advance to the next marshalled record (records are variable length) */
553 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
555 struct ctdb_ltdb_header *hdr;
558 key.dptr = &rec->data[0];
559 key.dsize = rec->keylen;
560 data.dptr = &rec->data[key.dsize];
561 data.dsize = rec->datalen;
563 hdr = (struct ctdb_ltdb_header *)data.dptr;
565 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
566 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
567 talloc_free(tmp_ctx);
571 /* fetch the existing record, if any */
572 existing = tdb_fetch(recdb->tdb, key);
574 if (existing.dptr != NULL) {
575 struct ctdb_ltdb_header header;
576 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
577 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
578 (unsigned)existing.dsize, srcnode));
580 talloc_free(tmp_ctx);
583 header = *(struct ctdb_ltdb_header *)existing.dptr;
/* keep the existing record unless the incoming one wins the rsn merge */
585 if (!(header.rsn < hdr->rsn ||
586 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
591 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
592 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
593 talloc_free(tmp_ctx);
598 talloc_free(tmp_ctx);
/* Pull database "dbid" from every active node into the recovery tdb
   (rsn-based merge per record via pull_one_remote_database).  A node
   that fails to deliver its copy is blamed with nodemap->num credits. */
604 pull all the remote database contents into the recdb
606 static int pull_remote_database(struct ctdb_context *ctdb,
607 struct ctdb_recoverd *rec,
608 struct ctdb_node_map *nodemap,
609 struct tdb_wrap *recdb, uint32_t dbid)
613 /* pull all records from all other nodes across onto this node
614 (this merges based on rsn)
616 for (j=0; j<nodemap->num; j++) {
617 /* dont merge from nodes that are unavailable */
618 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
621 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
622 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
623 nodemap->nodes[j].pnn));
624 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
/* Propagate node flags to all active nodes via the MODIFY_FLAGS control
   (set "flags", clear everything else).  Returns 0/-1. */
634 update flags on all active nodes
636 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
640 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
642 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
/* Push our vnnmap to every active node so the whole cluster agrees on
   generation and lmaster layout.  Returns 0/-1. */
650 ensure all nodes have the same vnnmap we do
652 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
653 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
657 /* push the new vnn map out to all the nodes */
658 for (j=0; j<nodemap->num; j++) {
659 /* dont push to nodes that are unavailable */
660 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
664 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
666 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
/* One in-flight vacuuming work item: a marshalled batch of records
   (recs) received from srcnode for one database, with r tracking the
   current position in the batch.  Linked into rec->vacuum_info. */
676 struct vacuum_info *next, *prev;
677 struct ctdb_recoverd *rec;
679 struct ctdb_db_context *ctdb_db;
680 struct ctdb_marshall_buffer *recs;
681 struct ctdb_rec_data *r;
/* forward declaration - the fetch loop and its callback are mutually recursive */
684 static void vacuum_fetch_next(struct vacuum_info *v);
/* Completion callback for one vacuum fetch call: simply advance to the
   next record in the batch. */
687 called when a vacuum fetch has completed - just free it and do the next one
689 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
691 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
693 vacuum_fetch_next(v);
/* Walk the vacuum batch: for each record, if it is not already local,
   issue a CTDB_NULL_FUNC call with CTDB_IMMEDIATE_MIGRATION to migrate
   the record to this node, then continue from vacuum_fetch_callback.
   Records we cannot lock without blocking are skipped - the recovery
   daemon must never stall on a tdb chainlock. */
698 process the next element from the vacuum list
700 static void vacuum_fetch_next(struct vacuum_info *v)
702 struct ctdb_call call;
703 struct ctdb_rec_data *r;
705 while (v->recs->count) {
706 struct ctdb_client_call_state *state;
708 struct ctdb_ltdb_header *hdr;
711 call.call_id = CTDB_NULL_FUNC;
712 call.flags = CTDB_IMMEDIATE_MIGRATION;
/* advance the cursor past the current variable-length record */
715 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
718 call.key.dptr = &r->data[0];
719 call.key.dsize = r->keylen;
721 /* ensure we don't block this daemon - just skip a record if we can't get
723 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
727 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
728 if (data.dptr == NULL) {
729 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
733 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
735 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
739 hdr = (struct ctdb_ltdb_header *)data.dptr;
740 if (hdr->dmaster == v->rec->ctdb->pnn) {
741 /* its already local */
743 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
749 state = ctdb_call_send(v->ctdb_db, &call);
/* drop the chainlock before waiting; the daemon serialises the migration */
750 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
752 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
756 state->async.fn = vacuum_fetch_callback;
757 state->async.private_data = v;
/* talloc destructor: unlink the work item from rec->vacuum_info when freed */
766 destroy a vacuum info structure
768 static int vacuum_info_destructor(struct vacuum_info *v)
770 DLIST_REMOVE(v->rec->vacuum_info, v);
/* Message handler for vacuum-fetch requests: another node sent us a
   marshalled batch of records it wants us to take over.  We ignore the
   batch if one from the same (srcnode, db) pair is already in progress,
   look up the database (attaching if necessary), copy the batch into a
   new vacuum_info work item and start processing it asynchronously. */
776 handler for vacuum fetch
778 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
779 TDB_DATA data, void *private_data)
781 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
782 struct ctdb_marshall_buffer *recs;
784 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
786 struct ctdb_dbid_map *dbmap=NULL;
787 bool persistent = false;
788 struct ctdb_db_context *ctdb_db;
789 struct ctdb_rec_data *r;
791 struct vacuum_info *v;
793 recs = (struct ctdb_marshall_buffer *)data.dptr;
794 r = (struct ctdb_rec_data *)&recs->data[0];
796 if (recs->count == 0) {
797 talloc_free(tmp_ctx);
/* drop the batch if we are already vacuuming this db for this source node */
803 for (v=rec->vacuum_info;v;v=v->next) {
804 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
805 /* we're already working on records from this node */
806 talloc_free(tmp_ctx);
811 /* work out if the database is persistent */
812 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
814 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
815 talloc_free(tmp_ctx);
819 for (i=0;i<dbmap->num;i++) {
820 if (dbmap->dbs[i].dbid == recs->db_id) {
821 persistent = dbmap->dbs[i].persistent;
825 if (i == dbmap->num) {
826 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
827 talloc_free(tmp_ctx);
831 /* find the name of this database */
832 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
833 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
834 talloc_free(tmp_ctx);
/* attach is idempotent - returns the existing ctdb_db if already attached */
839 ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
840 if (ctdb_db == NULL) {
841 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
842 talloc_free(tmp_ctx);
846 v = talloc_zero(rec, struct vacuum_info);
848 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
849 talloc_free(tmp_ctx);
854 v->srcnode = srcnode;
855 v->ctdb_db = ctdb_db;
/* take our own copy of the batch - "data" belongs to the message layer */
856 v->recs = talloc_memdup(v, recs, data.dsize);
857 if (v->recs == NULL) {
858 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
860 talloc_free(tmp_ctx);
863 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
865 DLIST_ADD(rec->vacuum_info, v);
867 talloc_set_destructor(v, vacuum_info_destructor);
869 vacuum_fetch_next(v);
870 talloc_free(tmp_ctx);
/* Timer callback for ctdb_wait_timeout: flags completion through the
   uint32_t pointed to by p. */
875 called when ctdb_wait_timeout should finish
877 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
878 struct timeval yt, void *p)
880 uint32_t *timed_out = (uint32_t *)p;
/* Block for "secs" seconds while still pumping the event loop, so
   timers and messages continue to be serviced during the wait. */
885 wait for a given number of seconds
887 static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
889 uint32_t timed_out = 0;
890 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
892 event_loop_once(ctdb->ev);
/* Timer callback: the election window has closed; clearing
   rec->election_timeout lets ctdb_wait_election return. */
897 called when an election times out (ends)
899 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
900 struct timeval t, void *p)
902 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
903 rec->election_timeout = NULL;
905 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
/* Pump the event loop until the election timer fires (each incoming
   election packet re-arms rec->election_timeout elsewhere). */
910 wait for an election to finish. It finished election_timeout seconds after
911 the last election packet is received
913 static void ctdb_wait_election(struct ctdb_recoverd *rec)
915 struct ctdb_context *ctdb = rec->ctdb;
916 while (rec->election_timeout) {
917 event_loop_once(ctdb->ev);
/* Reconcile node flags: fetch each connected remote node's view of its
   own flags and, on mismatch, push the authoritative flags cluster-wide
   and adopt the remote value locally.  Returns MONITOR_OK or
   MONITOR_FAILED (declared int - enum values convert implicitly). */
922 Update our local flags from all remote connected nodes.
923 This is only run when we are or we believe we are the recovery master
925 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
928 struct ctdb_context *ctdb = rec->ctdb;
929 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
931 /* get the nodemap for all active remote nodes and verify
932 they are the same as for this node
934 for (j=0; j<nodemap->num; j++) {
935 struct ctdb_node_map *remote_nodemap=NULL;
938 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
941 if (nodemap->nodes[j].pnn == ctdb->pnn) {
945 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
946 mem_ctx, &remote_nodemap);
948 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
949 nodemap->nodes[j].pnn));
950 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
951 talloc_free(mem_ctx);
952 return MONITOR_FAILED;
/* each node is authoritative for its own flags */
954 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
955 /* We should tell our daemon about this so it
956 updates its flags or else we will log the same
957 message again in the next iteration of recovery.
958 Since we are the recovery master we can just as
959 well update the flags on all nodes.
961 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
963 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
967 /* Update our local copy of the flags in the recovery
970 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
971 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
972 nodemap->nodes[j].flags));
973 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
975 talloc_free(remote_nodemap);
977 talloc_free(mem_ctx);
/* Generate a random recovery generation id, retrying until the value
   differs from INVALID_GENERATION. */
982 /* Create a new random generation ip.
983 The generation id can not be the INVALID_GENERATION id
985 static uint32_t new_generation(void)
990 generation = random();
992 if (generation != INVALID_GENERATION) {
/* Create the temporary recovery database (recdb.tdb) under the ctdb db
   directory.  TDB_NOLOCK is safe because only this single-threaded
   daemon touches it; NOMMAP is added when scheduling tweaks are off.
   O_EXCL ensures a stale file from a previous run is not reused. */
1002 create a temporary working database
1004 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1007 struct tdb_wrap *recdb;
1010 /* open up the temporary recovery database */
1011 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
1017 tdb_flags = TDB_NOLOCK;
1018 if (!ctdb->do_setsched) {
1019 tdb_flags |= TDB_NOMMAP;
1022 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1023 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1024 if (recdb == NULL) {
1025 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
/* tdb_traverse_read callback used by push_recdb_database: skips empty
   records, rewrites each record's dmaster to this node and appends it
   to the growing marshall buffer in *p (a struct recdb_data).  Sets
   params->failed on allocation failure. */
1035 a traverse function for pulling all relevant records from recdb
1038 struct ctdb_context *ctdb;
1039 struct ctdb_marshall_buffer *recdata;
1044 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1046 struct recdb_data *params = (struct recdb_data *)p;
1047 struct ctdb_rec_data *rec;
1048 struct ctdb_ltdb_header *hdr;
1050 /* skip empty records */
1051 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1055 /* update the dmaster field to point to us */
1056 hdr = (struct ctdb_ltdb_header *)data.dptr;
1057 hdr->dmaster = params->ctdb->pnn;
1059 /* add the record to the blob ready to send to the nodes */
1060 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1062 params->failed = true;
1065 params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
1066 if (params->recdata == NULL) {
1067 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1068 rec->length + params->len, params->recdata->count));
1069 params->failed = true;
1072 params->recdata->count++;
1073 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1074 params->len += rec->length;
1081 push the recdb database out to all nodes
1083 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1084 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1086 struct recdb_data params;
1087 struct ctdb_marshall_buffer *recdata;
1089 TALLOC_CTX *tmp_ctx;
1092 tmp_ctx = talloc_new(ctdb);
1093 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1095 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1096 CTDB_NO_MEMORY(ctdb, recdata);
1098 recdata->db_id = dbid;
1101 params.recdata = recdata;
1102 params.len = offsetof(struct ctdb_marshall_buffer, data);
1103 params.failed = false;
1105 if (tdb_traverse_read(recdb->tdb, traverse_recdb, ¶ms) == -1) {
1106 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1107 talloc_free(params.recdata);
1108 talloc_free(tmp_ctx);
1112 if (params.failed) {
1113 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1114 talloc_free(params.recdata);
1115 talloc_free(tmp_ctx);
1119 recdata = params.recdata;
1121 outdata.dptr = (void *)recdata;
1122 outdata.dsize = params.len;
1124 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1125 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1127 CONTROL_TIMEOUT(), false, outdata,
1130 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1131 talloc_free(recdata);
1132 talloc_free(tmp_ctx);
1136 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1137 dbid, recdata->count));
1139 talloc_free(recdata);
1140 talloc_free(tmp_ctx);
/* Full recovery of one database: merge all nodes' copies into a fresh
   temporary recdb, wipe the database cluster-wide inside the recovery
   transaction, then push the merged content (with dmaster rewritten to
   this node) back out to every active node. */
1147 go through a full recovery on one database
1149 static int recover_database(struct ctdb_recoverd *rec,
1150 TALLOC_CTX *mem_ctx,
1153 struct ctdb_node_map *nodemap,
1154 uint32_t transaction_id)
1156 struct tdb_wrap *recdb;
1158 struct ctdb_context *ctdb = rec->ctdb;
1160 struct ctdb_control_wipe_database w;
1163 recdb = create_recdb(ctdb, mem_ctx);
1164 if (recdb == NULL) {
1168 /* pull all remote databases onto the recdb */
1169 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
1171 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1175 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1177 /* wipe all the remote databases. This is safe as we are in a transaction */
1179 w.transaction_id = transaction_id;
1181 data.dptr = (void *)&w;
1182 data.dsize = sizeof(w);
1184 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1185 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1187 CONTROL_TIMEOUT(), false, data,
1190 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1195 /* push out the correct database. This sets the dmaster and skips
1196 the empty records */
1197 ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
1203 /* all done with this database */
/* Re-read the nodes file so membership changes take effect. */
1210 reload the nodes file
1212 static void reload_nodes_file(struct ctdb_context *ctdb)
1215 ctdb_load_nodes_file(ctdb);
1220 we are the recmaster, and recovery is needed - start a recovery run
1222 static int do_recovery(struct ctdb_recoverd *rec,
1223 TALLOC_CTX *mem_ctx, uint32_t pnn,
1224 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1226 struct ctdb_context *ctdb = rec->ctdb;
1228 uint32_t generation;
1229 struct ctdb_dbid_map *dbmap;
1232 struct timeval start_time;
1234 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1236 /* if recovery fails, force it again */
1237 rec->need_recovery = true;
1239 for (i=0; i<ctdb->num_nodes; i++) {
1240 struct ctdb_banning_state *ban_state;
1242 if (ctdb->nodes[i]->ban_state == NULL) {
1245 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1246 if (ban_state->count < 2*ctdb->num_nodes) {
1249 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1250 ctdb->nodes[i]->pnn, ban_state->count,
1251 ctdb->tunable.recovery_ban_period));
1252 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1253 ban_state->count = 0;
1257 if (ctdb->tunable.verify_recovery_lock != 0) {
1258 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1259 start_time = timeval_current();
1260 if (!ctdb_recovery_lock(ctdb, true)) {
1261 ctdb_set_culprit(rec, pnn);
1262 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
1265 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1266 DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
1269 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1271 /* get a list of all databases */
1272 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1274 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1278 /* we do the db creation before we set the recovery mode, so the freeze happens
1279 on all databases we will be dealing with. */
1281 /* verify that we have all the databases any other node has */
1282 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1284 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1288 /* verify that all other nodes have all our databases */
1289 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1291 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1294 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1296 /* update the database priority for all remote databases */
1297 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1299 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1301 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1304 /* set recovery mode to active on all nodes */
1305 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1307 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1311 /* execute the "startrecovery" event script on all nodes */
1312 ret = run_startrecovery_eventscript(rec, nodemap);
1314 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1318 /* pick a new generation number */
1319 generation = new_generation();
1321 /* change the vnnmap on this node to use the new generation
1322 number but not on any other nodes.
1323 this guarantees that if we abort the recovery prematurely
1324 for some reason (a node stops responding?)
1325 that we can just return immediately and we will reenter
1326 recovery shortly again.
1327 I.e. we deliberately leave the cluster with an inconsistent
1328 generation id to allow us to abort recovery at any stage and
1329 just restart it from scratch.
1331 vnnmap->generation = generation;
1332 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1334 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1338 data.dptr = (void *)&generation;
1339 data.dsize = sizeof(uint32_t);
1341 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1342 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1344 CONTROL_TIMEOUT(), false, data,
1346 transaction_start_fail_callback,
1348 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1349 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1351 CONTROL_TIMEOUT(), false, tdb_null,
1355 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1360 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1362 for (i=0;i<dbmap->num;i++) {
1363 if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
1364 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1369 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1371 /* commit all the changes */
1372 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1374 CONTROL_TIMEOUT(), false, data,
1377 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1381 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1384 /* update the capabilities for all nodes */
1385 ret = update_capabilities(ctdb, nodemap);
1387 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1391 /* build a new vnn map with all the currently active and
1393 generation = new_generation();
1394 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1395 CTDB_NO_MEMORY(ctdb, vnnmap);
1396 vnnmap->generation = generation;
1398 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1399 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1400 for (i=j=0;i<nodemap->num;i++) {
1401 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1404 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1405 /* this node can not be an lmaster */
1406 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1411 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1412 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1413 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1416 if (vnnmap->size == 0) {
1417 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1419 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1420 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1421 vnnmap->map[0] = pnn;
1424 /* update to the new vnnmap on all nodes */
1425 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1427 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1431 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1433 /* update recmaster to point to us for all nodes */
1434 ret = set_recovery_master(ctdb, nodemap, pnn);
1436 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1440 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1443 update all nodes to have the same flags that we have
1445 for (i=0;i<nodemap->num;i++) {
1446 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1450 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1452 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1457 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1459 /* disable recovery mode */
1460 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1462 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1466 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1469 tell nodes to takeover their public IPs
1471 rec->need_takeover_run = false;
1472 ret = ctdb_takeover_run(ctdb, nodemap);
1474 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
1477 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
1479 /* execute the "recovered" event script on all nodes */
1480 ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
1482 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1486 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1488 /* send a message to all clients telling them that the cluster
1489 has been reconfigured */
1490 ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1492 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1494 rec->need_recovery = false;
1496 /* we managed to complete a full recovery, make sure to forgive
1497 any past sins by the nodes that could now participate in the
1500 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1501 for (i=0;i<nodemap->num;i++) {
1502 struct ctdb_banning_state *ban_state;
1504 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1508 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1509 if (ban_state == NULL) {
1513 ban_state->count = 0;
1517 /* We just finished a recovery successfully.
1518 We now wait for rerecovery_timeout before we allow
1519 another recovery to take place.
1521 DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
1522 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1523 DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;		/* nodes the sender can see - primary criterion */
	struct timeval priority_time;	/* when the sender's daemon started - older wins */
	uint32_t pnn;			/* physical node number of the sender; lowest wins as tiebreak */
	uint32_t node_flags;		/* sender's flags (banned/stopped nodes lose automatically) */
};
1541 form this nodes election data
1543 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1546 struct ctdb_node_map *nodemap;
1547 struct ctdb_context *ctdb = rec->ctdb;
1551 em->pnn = rec->ctdb->pnn;
1552 em->priority_time = rec->priority_time;
1554 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1556 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1560 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1561 em->node_flags = rec->node_flags;
1563 for (i=0;i<nodemap->num;i++) {
1564 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1565 em->num_connected++;
1569 /* we shouldnt try to win this election if we cant be a recmaster */
1570 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1571 em->num_connected = 0;
1572 em->priority_time = timeval_current();
1575 talloc_free(nodemap);
1579 see if the given election data wins
1581 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1583 struct election_message myem;
1586 ctdb_election_data(rec, &myem);
1588 /* we cant win if we dont have the recmaster capability */
1589 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1593 /* we cant win if we are banned */
1594 if (rec->node_flags & NODE_FLAGS_BANNED) {
1598 /* we cant win if we are stopped */
1599 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1603 /* we will automatically win if the other node is banned */
1604 if (em->node_flags & NODE_FLAGS_BANNED) {
1608 /* we will automatically win if the other node is banned */
1609 if (em->node_flags & NODE_FLAGS_STOPPED) {
1613 /* try to use the most connected node */
1615 cmp = (int)myem.num_connected - (int)em->num_connected;
1618 /* then the longest running node */
1620 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1624 cmp = (int)myem.pnn - (int)em->pnn;
1631 send out an election request
1633 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1636 TDB_DATA election_data;
1637 struct election_message emsg;
1639 struct ctdb_context *ctdb = rec->ctdb;
1641 srvid = CTDB_SRVID_RECOVERY;
1643 ctdb_election_data(rec, &emsg);
1645 election_data.dsize = sizeof(struct election_message);
1646 election_data.dptr = (unsigned char *)&emsg;
1649 /* send an election message to all active nodes */
1650 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1651 ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1654 /* A new node that is already frozen has entered the cluster.
1655 The existing nodes are not frozen and dont need to be frozen
1656 until the election has ended and we start the actual recovery
1658 if (update_recmaster == true) {
1659 /* first we assume we will win the election and set
1660 recoverymaster to be ourself on the current node
1662 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1664 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1674 this function will unban all nodes in the cluster
1676 static void unban_all_nodes(struct ctdb_context *ctdb)
1679 struct ctdb_node_map *nodemap;
1680 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1682 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1684 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
1688 for (i=0;i<nodemap->num;i++) {
1689 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
1690 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
1691 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
1695 talloc_free(tmp_ctx);
1700 we think we are winning the election - send a broadcast election request
1702 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1704 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1707 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
1709 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1712 talloc_free(rec->send_election_te);
1713 rec->send_election_te = NULL;
1717 handler for memory dumps
1719 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
1720 TDB_DATA data, void *private_data)
1722 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1725 struct rd_memdump_reply *rd;
1727 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1728 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1729 talloc_free(tmp_ctx);
1732 rd = (struct rd_memdump_reply *)data.dptr;
1734 dump = talloc_zero(tmp_ctx, TDB_DATA);
1736 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1737 talloc_free(tmp_ctx);
1740 ret = ctdb_dump_memory(ctdb, dump);
1742 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1743 talloc_free(tmp_ctx);
1747 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1749 ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1751 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1752 talloc_free(tmp_ctx);
1756 talloc_free(tmp_ctx);
1760 handler for reload_nodes
1762 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
1763 TDB_DATA data, void *private_data)
1765 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1767 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1769 reload_nodes_file(rec->ctdb);
1773 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
1774 struct timeval yt, void *p)
1776 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1778 talloc_free(rec->ip_check_disable_ctx);
1779 rec->ip_check_disable_ctx = NULL;
1782 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
1783 TDB_DATA data, void *private_data)
1785 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1788 if (rec->ip_check_disable_ctx != NULL) {
1789 talloc_free(rec->ip_check_disable_ctx);
1790 rec->ip_check_disable_ctx = NULL;
1793 if (data.dsize != sizeof(uint32_t)) {
1794 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu expexting %lu\n", data.dsize, sizeof(uint32_t)));
1797 if (data.dptr == NULL) {
1798 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
1802 timeout = *((uint32_t *)data.dptr);
1803 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
1805 rec->ip_check_disable_ctx = talloc_new(rec);
1806 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
1808 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
1813 handler for ip reallocate, just add it to the list of callers and
1814 handle this later in the monitor_cluster loop so we do not recurse
1815 with other callers to takeover_run()
1817 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
1818 TDB_DATA data, void *private_data)
1820 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1821 struct ip_reallocate_list *caller;
1823 if (data.dsize != sizeof(struct rd_memdump_reply)) {
1824 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1828 if (rec->ip_reallocate_ctx == NULL) {
1829 rec->ip_reallocate_ctx = talloc_new(rec);
1830 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1833 caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
1834 CTDB_NO_MEMORY_FATAL(ctdb, caller);
1836 caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
1837 caller->next = rec->reallocate_callers;
1838 rec->reallocate_callers = caller;
1843 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
1845 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1848 struct ip_reallocate_list *callers;
1850 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
1851 ret = ctdb_takeover_run(ctdb, rec->nodemap);
1852 result.dsize = sizeof(int32_t);
1853 result.dptr = (uint8_t *)&ret;
1855 for (callers=rec->reallocate_callers; callers; callers=callers->next) {
1856 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1857 ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
1859 DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply message to %u:%lu\n", callers->rd->pnn, callers->rd->srvid));
1863 talloc_free(tmp_ctx);
1864 talloc_free(rec->ip_reallocate_ctx);
1865 rec->ip_reallocate_ctx = NULL;
1866 rec->reallocate_callers = NULL;
1872 handler for recovery master elections
1874 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
1875 TDB_DATA data, void *private_data)
1877 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1879 struct election_message *em = (struct election_message *)data.dptr;
1880 TALLOC_CTX *mem_ctx;
1882 /* we got an election packet - update the timeout for the election */
1883 talloc_free(rec->election_timeout);
1884 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1885 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1886 ctdb_election_timeout, rec);
1888 mem_ctx = talloc_new(ctdb);
1890 /* someone called an election. check their election data
1891 and if we disagree and we would rather be the elected node,
1892 send a new election message to all other nodes
1894 if (ctdb_election_win(rec, em)) {
1895 if (!rec->send_election_te) {
1896 rec->send_election_te = event_add_timed(ctdb->ev, rec,
1897 timeval_current_ofs(0, 500000),
1898 election_send_request, rec);
1900 talloc_free(mem_ctx);
1901 /*unban_all_nodes(ctdb);*/
1906 talloc_free(rec->send_election_te);
1907 rec->send_election_te = NULL;
1909 if (ctdb->tunable.verify_recovery_lock != 0) {
1910 /* release the recmaster lock */
1911 if (em->pnn != ctdb->pnn &&
1912 ctdb->recovery_lock_fd != -1) {
1913 close(ctdb->recovery_lock_fd);
1914 ctdb->recovery_lock_fd = -1;
1915 unban_all_nodes(ctdb);
1919 /* ok, let that guy become recmaster then */
1920 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
1922 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
1923 talloc_free(mem_ctx);
1927 talloc_free(mem_ctx);
1933 force the start of the election process
1935 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1936 struct ctdb_node_map *nodemap)
1939 struct ctdb_context *ctdb = rec->ctdb;
1941 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1943 /* set all nodes to recovery mode to stop all internode traffic */
1944 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1946 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1950 talloc_free(rec->election_timeout);
1951 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
1952 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1953 ctdb_election_timeout, rec);
1955 ret = send_election_request(rec, pnn, true);
1957 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1961 /* wait for a few seconds to collect all responses */
1962 ctdb_wait_election(rec);
1968 handler for when a node changes its flags
1970 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
1971 TDB_DATA data, void *private_data)
1974 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1975 struct ctdb_node_map *nodemap=NULL;
1976 TALLOC_CTX *tmp_ctx;
1977 uint32_t changed_flags;
1979 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1980 int disabled_flag_changed;
1982 if (data.dsize != sizeof(*c)) {
1983 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1987 tmp_ctx = talloc_new(ctdb);
1988 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1990 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1992 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1993 talloc_free(tmp_ctx);
1998 for (i=0;i<nodemap->num;i++) {
1999 if (nodemap->nodes[i].pnn == c->pnn) break;
2002 if (i == nodemap->num) {
2003 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2004 talloc_free(tmp_ctx);
2008 changed_flags = c->old_flags ^ c->new_flags;
2010 if (nodemap->nodes[i].flags != c->new_flags) {
2011 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2014 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2016 nodemap->nodes[i].flags = c->new_flags;
2018 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2019 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2022 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2023 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2027 ctdb->recovery_master == ctdb->pnn &&
2028 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2029 /* Only do the takeover run if the perm disabled or unhealthy
2030 flags changed since these will cause an ip failover but not
2032 If the node became disconnected or banned this will also
2033 lead to an ip address failover but that is handled
2036 if (disabled_flag_changed) {
2037 rec->need_takeover_run = true;
2041 talloc_free(tmp_ctx);
2045 handler for when we need to push out flag changes ot all other nodes
2047 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2048 TDB_DATA data, void *private_data)
2051 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2053 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
2055 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
2060 struct verify_recmode_normal_data {
2062 enum monitor_result status;
2065 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2067 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2070 /* one more node has responded with recmode data*/
2073 /* if we failed to get the recmode, then return an error and let
2074 the main loop try again.
2076 if (state->state != CTDB_CONTROL_DONE) {
2077 if (rmdata->status == MONITOR_OK) {
2078 rmdata->status = MONITOR_FAILED;
2083 /* if we got a response, then the recmode will be stored in the
2086 if (state->status != CTDB_RECOVERY_NORMAL) {
2087 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2088 rmdata->status = MONITOR_RECOVERY_NEEDED;
2095 /* verify that all nodes are in normal recovery mode */
2096 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2098 struct verify_recmode_normal_data *rmdata;
2099 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2100 struct ctdb_client_control_state *state;
2101 enum monitor_result status;
2104 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2105 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2107 rmdata->status = MONITOR_OK;
2109 /* loop over all active nodes and send an async getrecmode call to
2111 for (j=0; j<nodemap->num; j++) {
2112 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2115 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2117 nodemap->nodes[j].pnn);
2118 if (state == NULL) {
2119 /* we failed to send the control, treat this as
2120 an error and try again next iteration
2122 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2123 talloc_free(mem_ctx);
2124 return MONITOR_FAILED;
2127 /* set up the callback functions */
2128 state->async.fn = verify_recmode_normal_callback;
2129 state->async.private_data = rmdata;
2131 /* one more control to wait for to complete */
2136 /* now wait for up to the maximum number of seconds allowed
2137 or until all nodes we expect a response from has replied
2139 while (rmdata->count > 0) {
2140 event_loop_once(ctdb->ev);
2143 status = rmdata->status;
2144 talloc_free(mem_ctx);
2149 struct verify_recmaster_data {
2150 struct ctdb_recoverd *rec;
2153 enum monitor_result status;
2156 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2158 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2161 /* one more node has responded with recmaster data*/
2164 /* if we failed to get the recmaster, then return an error and let
2165 the main loop try again.
2167 if (state->state != CTDB_CONTROL_DONE) {
2168 if (rmdata->status == MONITOR_OK) {
2169 rmdata->status = MONITOR_FAILED;
2174 /* if we got a response, then the recmaster will be stored in the
2177 if (state->status != rmdata->pnn) {
2178 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2179 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2180 rmdata->status = MONITOR_ELECTION_NEEDED;
2187 /* verify that all nodes agree that we are the recmaster */
2188 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2190 struct ctdb_context *ctdb = rec->ctdb;
2191 struct verify_recmaster_data *rmdata;
2192 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2193 struct ctdb_client_control_state *state;
2194 enum monitor_result status;
2197 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2198 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2202 rmdata->status = MONITOR_OK;
2204 /* loop over all active nodes and send an async getrecmaster call to
2206 for (j=0; j<nodemap->num; j++) {
2207 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2210 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2212 nodemap->nodes[j].pnn);
2213 if (state == NULL) {
2214 /* we failed to send the control, treat this as
2215 an error and try again next iteration
2217 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2218 talloc_free(mem_ctx);
2219 return MONITOR_FAILED;
2222 /* set up the callback functions */
2223 state->async.fn = verify_recmaster_callback;
2224 state->async.private_data = rmdata;
2226 /* one more control to wait for to complete */
2231 /* now wait for up to the maximum number of seconds allowed
2232 or until all nodes we expect a response from has replied
2234 while (rmdata->count > 0) {
2235 event_loop_once(ctdb->ev);
2238 status = rmdata->status;
2239 talloc_free(mem_ctx);
2244 /* called to check that the allocation of public ip addresses is ok.
2246 static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
2248 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2249 struct ctdb_all_public_ips *ips = NULL;
2250 struct ctdb_uptime *uptime1 = NULL;
2251 struct ctdb_uptime *uptime2 = NULL;
2254 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2255 CTDB_CURRENT_NODE, &uptime1);
2257 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2258 talloc_free(mem_ctx);
2262 /* read the ip allocation from the local node */
2263 ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
2265 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node %u\n", pnn));
2266 talloc_free(mem_ctx);
2270 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2271 CTDB_CURRENT_NODE, &uptime2);
2273 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2274 talloc_free(mem_ctx);
2278 /* skip the check if the startrecovery time has changed */
2279 if (timeval_compare(&uptime1->last_recovery_started,
2280 &uptime2->last_recovery_started) != 0) {
2281 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2282 talloc_free(mem_ctx);
2286 /* skip the check if the endrecovery time has changed */
2287 if (timeval_compare(&uptime1->last_recovery_finished,
2288 &uptime2->last_recovery_finished) != 0) {
2289 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2290 talloc_free(mem_ctx);
2294 /* skip the check if we have started but not finished recovery */
2295 if (timeval_compare(&uptime1->last_recovery_finished,
2296 &uptime1->last_recovery_started) != 1) {
2297 DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery. skipping public ip address check\n"));
2298 talloc_free(mem_ctx);
2303 /* verify that we have the ip addresses we should have
2304 and we dont have ones we shouldnt have.
2305 if we find an inconsistency we set recmode to
2306 active on the local node and wait for the recmaster
2307 to do a full blown recovery
2309 for (j=0; j<ips->num; j++) {
2310 if (ips->ips[j].pnn == pnn) {
2311 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2312 DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
2313 ctdb_addr_to_str(&ips->ips[j].addr)));
2314 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2316 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2318 talloc_free(mem_ctx);
2321 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2323 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2325 talloc_free(mem_ctx);
2330 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2331 DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n",
2332 ctdb_addr_to_str(&ips->ips[j].addr)));
2334 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2336 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
2338 talloc_free(mem_ctx);
2341 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2343 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
2345 talloc_free(mem_ctx);
2352 talloc_free(mem_ctx);
2357 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2359 struct ctdb_node_map **remote_nodemaps = callback_data;
2361 if (node_pnn >= ctdb->num_nodes) {
2362 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2366 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
2370 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2371 struct ctdb_node_map *nodemap,
2372 struct ctdb_node_map **remote_nodemaps)
2376 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2377 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2379 CONTROL_TIMEOUT(), false, tdb_null,
2380 async_getnodemap_callback,
2382 remote_nodemaps) != 0) {
2383 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};

/* state shared between check_recovery_lock(), its child process and the
   associated timeout/fd events */
struct ctdb_check_reclock_state {
	struct ctdb_context *ctdb;
	struct timeval start_time;	/* to report reclock check latency */
	int fd[2];			/* pipe: child writes its status, parent reads - referenced by the destructor */
	pid_t child;			/* pid of the checking child, killed by the destructor */
	struct timed_event *te;		/* timeout event, fires if the child hangs */
	struct fd_event *fde;		/* read event on fd[0] */
	enum reclock_child_status status;
};
2402 /* when we free the reclock state we must kill any child process.
2404 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
2406 struct ctdb_context *ctdb = state->ctdb;
2408 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
2410 if (state->fd[0] != -1) {
2411 close(state->fd[0]);
2414 if (state->fd[1] != -1) {
2415 close(state->fd[1]);
2418 kill(state->child, SIGKILL);
2423 called if our check_reclock child times out. this would happen if
2424 i/o to the reclock file blocks.
2426 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
2427 struct timeval t, void *private_data)
2429 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
2430 struct ctdb_check_reclock_state);
2432 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
2433 state->status = RECLOCK_TIMEOUT;
2436 /* this is called when the child process has completed checking the reclock
2437 file and has written data back to us through the pipe.
2439 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
2440 uint16_t flags, void *private_data)
2442 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
2443 struct ctdb_check_reclock_state);
2447 /* we got a response from our child process so we can abort the
2450 talloc_free(state->te);
2453 ret = read(state->fd[0], &c, 1);
2454 if (ret != 1 || c != RECLOCK_OK) {
2455 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
2456 state->status = RECLOCK_FAILED;
2461 state->status = RECLOCK_OK;
/*
 * Verify that the recovery lock we hold is still valid (not stale) by
 * forking a child that does a bounded read of the reclock file, while
 * the parent waits with a 15 second timeout. Returns non-zero paths on
 * failure (elided returns not visible in this listing).
 */
2465 static int check_recovery_lock(struct ctdb_context *ctdb)
2468 struct ctdb_check_reclock_state *state;
/* captured before fork() so the child can poll whether we still live */
2469 pid_t parent = getpid();
/* we can only be called while actually holding the reclock fd */
2471 if (ctdb->recovery_lock_fd == -1) {
2472 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
2476 state = talloc(ctdb, struct ctdb_check_reclock_state);
2477 CTDB_NO_MEMORY(ctdb, state);
2480 state->start_time = timeval_current();
2481 state->status = RECLOCK_CHECKING;
/* pipe used by the child to report the check result back to us */
2485 ret = pipe(state->fd);
2488 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
2492 state->child = fork();
2493 if (state->child == (pid_t)-1) {
2494 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
2495 close(state->fd[0]);
2497 close(state->fd[1]);
/* child: probe the reclock file, then report and linger until parent dies */
2503 if (state->child == 0) {
2504 char cc = RECLOCK_OK;
/* child only writes; close the read end */
2505 close(state->fd[0]);
/* a 1-byte pread at offset 0 is enough to detect a dead/stale CFS file */
2508 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
2509 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
2510 cc = RECLOCK_FAILED;
/* NOTE(review): write() return value is ignored here and below */
2513 write(state->fd[1], &cc, 1);
2514 /* make sure we die when our parent dies */
2515 while (kill(parent, 0) == 0 || errno != ESRCH) {
2517 write(state->fd[1], &cc, 1);
/* parent: keep only the read end and don't leak it across exec */
2521 close(state->fd[1]);
2523 set_close_on_exec(state->fd[0]);
2525 DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
/* destructor (not visible here) reports latency and cleans up fds/child */
2527 talloc_set_destructor(state, check_reclock_destructor);
/* give the child 15 seconds before declaring the check hung */
2529 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
2530 ctdb_check_reclock_timeout, state);
2531 if (state->te == NULL) {
2532 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
2537 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
2538 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
2539 reclock_child_handler,
2542 if (state->fde == NULL) {
2543 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
/* nested event loop: block here until child reply or timeout flips status */
2548 while (state->status == RECLOCK_CHECKING) {
2549 event_loop_once(ctdb->ev);
/* on failure drop the now-untrustworthy lock fd so it gets re-taken */
2552 if (state->status == RECLOCK_FAILED) {
2553 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
2554 close(ctdb->recovery_lock_fd);
2555 ctdb->recovery_lock_fd = -1;
/*
 * Ask the local ctdbd for the currently configured reclock file path
 * and bring our cached copy (ctdb->recovery_lock_file / _fd) in sync.
 * Handles three cases: reclock disabled, reclock newly enabled, and
 * reclock path changed. Returns 0 on success, -1 on failure to query
 * the daemon (elided returns not visible in this listing).
 */
2564 static int update_recovery_lock_file(struct ctdb_context *ctdb)
2566 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
2567 const char *reclockfile;
2569 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
2570 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
2571 talloc_free(tmp_ctx);
/* case 1: daemon reports no reclock file — disable verification */
2575 if (reclockfile == NULL) {
2576 if (ctdb->recovery_lock_file != NULL) {
2577 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
2578 talloc_free(ctdb->recovery_lock_file);
2579 ctdb->recovery_lock_file = NULL;
2580 if (ctdb->recovery_lock_fd != -1) {
2581 close(ctdb->recovery_lock_fd);
2582 ctdb->recovery_lock_fd = -1;
2585 ctdb->tunable.verify_recovery_lock = 0;
2586 talloc_free(tmp_ctx);
/* case 2: we had no reclock file cached — adopt the daemon's path */
2590 if (ctdb->recovery_lock_file == NULL) {
2591 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2592 if (ctdb->recovery_lock_fd != -1) {
2593 close(ctdb->recovery_lock_fd);
2594 ctdb->recovery_lock_fd = -1;
2596 talloc_free(tmp_ctx);
/* unchanged path: nothing to do */
2601 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
2602 talloc_free(tmp_ctx);
/* case 3: path changed — swap in the new one and drop the old fd.
 * NOTE(review): verify_recovery_lock is cleared here too; presumably it
 * is re-enabled elsewhere once the new file is locked — confirm. */
2606 talloc_free(ctdb->recovery_lock_file);
2607 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
2608 ctdb->tunable.verify_recovery_lock = 0;
2609 if (ctdb->recovery_lock_fd != -1) {
2610 close(ctdb->recovery_lock_fd);
2611 ctdb->recovery_lock_fd = -1;
2614 talloc_free(tmp_ctx);
2619 the main monitoring loop
/*
 * Core loop of the recovery daemon. Sets up message handlers once, then
 * iterates forever: ping the main daemon, refresh tunables/nodemap/vnnmap,
 * verify that a valid recovery master exists and that all nodes agree on
 * membership, flags and the vnnmap — triggering elections or do_recovery()
 * whenever an inconsistency is found. Per-iteration allocations hang off
 * mem_ctx, which is recycled at the top of each pass.
 */
2621 static void monitor_cluster(struct ctdb_context *ctdb)
2624 TALLOC_CTX *mem_ctx=NULL;
2625 struct ctdb_node_map *nodemap=NULL;
2626 struct ctdb_node_map *recmaster_nodemap=NULL;
2627 struct ctdb_node_map **remote_nodemaps=NULL;
2628 struct ctdb_vnn_map *vnnmap=NULL;
2629 struct ctdb_vnn_map *remote_vnnmap=NULL;
2630 int32_t debug_level;
2632 struct ctdb_recoverd *rec;
2634 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2636 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2637 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2641 rec->priority_time = timeval_current();
2643 /* register a message port for sending memory dumps */
2644 ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2646 /* register a message port for recovery elections */
2647 ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
2649 /* when nodes are disabled/enabled */
2650 ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
/* when we are asked to push out a flag change */
2653 ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2655 /* register a message port for vacuum fetch */
2656 ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
2658 /* register a message port for reloadnodes */
2659 ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2661 /* register a message port for performing a takeover run */
2662 ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2664 /* register a message port for disabling the ip check for a short while */
2665 ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
/* top of the (elided) main loop: discard last iteration's allocations */
2669 talloc_free(mem_ctx);
2672 mem_ctx = talloc_new(ctdb);
2674 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
2678 /* we only check for recovery once every second */
2679 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
2681 /* verify that the main daemon is still running */
2682 if (kill(ctdb->ctdbd_pid, 0) != 0) {
2683 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2687 /* ping the local daemon to tell it we are alive */
2688 ctdb_ctrl_recd_ping(ctdb);
2690 if (rec->election_timeout) {
2691 /* an election is in progress */
2695 /* read the debug level from the parent and update locally */
2696 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2698 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2701 LogLevel = debug_level;
2704 /* We must check if we need to ban a node here but we want to do this
2705 as early as possible so we dont wait until we have pulled the node
2706 map from the local node. thats why we have the hardcoded value 20
/* ban any node that has caused 20 or more recoveries recently; the
 * threshold is hardcoded (see comment above) because the nodemap is
 * not fetched yet at this point in the iteration */
2708 for (i=0; i<ctdb->num_nodes; i++) {
2709 struct ctdb_banning_state *ban_state;
2711 if (ctdb->nodes[i]->ban_state == NULL) {
2714 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
2715 if (ban_state->count < 20) {
2718 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
2719 ctdb->nodes[i]->pnn, ban_state->count,
2720 ctdb->tunable.recovery_ban_period));
2721 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
2722 ban_state->count = 0;
2725 /* get relevant tunables */
2726 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2728 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2732 /* get the current recovery lock file from the server */
2733 if (update_recovery_lock_file(ctdb) != 0) {
2734 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
2738 /* Make sure that if recovery lock verification becomes disabled when
/* ... we also release the fd we are holding on the reclock file */
2741 if (ctdb->tunable.verify_recovery_lock == 0) {
2742 if (ctdb->recovery_lock_fd != -1) {
2743 close(ctdb->recovery_lock_fd);
2744 ctdb->recovery_lock_fd = -1;
/* find out which physical node we are */
2748 pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
2749 if (pnn == (uint32_t)-1) {
2750 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
2754 /* get the vnnmap */
2755 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2757 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2762 /* get number of nodes */
2764 talloc_free(rec->nodemap);
2765 rec->nodemap = NULL;
/* the nodemap is parented to rec, not mem_ctx: it must outlive an
 * iteration since other handlers reference rec->nodemap */
2768 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2770 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2773 nodemap = rec->nodemap;
2775 /* check which node is the recovery master */
2776 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
2778 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
2782 /* if we are not the recmaster we can safely ignore any ip reallocate requests */
2783 if (rec->recmaster != pnn) {
/* freeing the ctx drops all queued callers in one go */
2784 if (rec->ip_reallocate_ctx != NULL) {
2785 talloc_free(rec->ip_reallocate_ctx);
2786 rec->ip_reallocate_ctx = NULL;
2787 rec->reallocate_callers = NULL;
2790 /* if there are takeovers requested, perform it and notify the waiters */
2791 if (rec->reallocate_callers) {
2792 process_ipreallocate_requests(ctdb, rec);
/* no recmaster elected yet — start an election */
2795 if (rec->recmaster == (uint32_t)-1) {
2796 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
2797 force_election(rec, pnn, nodemap);
2802 /* if the local daemon is STOPPED, we verify that the databases are
2803 also frozen and thet the recmode is set to active
2805 if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
2806 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2808 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
2810 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2811 DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
2813 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
2815 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
2818 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2820 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
2827 /* If the local node is stopped, verify we are not the recmaster
2828 and yield this role if so
2830 if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
2831 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
2832 force_election(rec, pnn, nodemap);
2836 /* check that we (recovery daemon) and the local ctdb daemon
2837 agrees on whether we are banned or not
2841 /* remember our own node flags */
2842 rec->node_flags = nodemap->nodes[pnn].flags;
2844 /* count how many active nodes there are */
2845 rec->num_active = 0;
2846 rec->num_connected = 0;
2847 for (i=0; i<nodemap->num; i++) {
2848 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2851 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2852 rec->num_connected++;
2857 /* verify that the recmaster node is still active */
2858 for (j=0; j<nodemap->num; j++) {
2859 if (nodemap->nodes[j].pnn==rec->recmaster) {
/* j == num means the recmaster fell out of the nodemap entirely */
2864 if (j == nodemap->num) {
2865 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
2866 force_election(rec, pnn, nodemap);
2870 /* if recovery master is disconnected we must elect a new recmaster */
2871 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
2872 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
2873 force_election(rec, pnn, nodemap);
2877 /* grap the nodemap from the recovery master to check if it is banned */
2878 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2879 mem_ctx, &recmaster_nodemap);
2881 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
2882 nodemap->nodes[j].pnn));
/* the recmaster's view of its OWN entry (index j) decides */
2887 if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2888 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
2889 force_election(rec, pnn, nodemap);
2894 /* verify that we have all ip addresses we should have and we dont
2895 * have addresses we shouldnt have.
2897 if (ctdb->do_checkpublicip) {
/* ip_check_disable_ctx non-NULL means checks are temporarily disabled */
2898 if (rec->ip_check_disable_ctx == NULL) {
2899 if (verify_ip_allocation(ctdb, pnn) != 0) {
2900 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
2907 /* if we are not the recmaster then we do not need to check
2908 if recovery is needed
2910 if (pnn != rec->recmaster) {
/* --- everything below runs only on the recovery master --- */
2915 /* ensure our local copies of flags are right */
2916 ret = update_local_flags(rec, nodemap);
2917 if (ret == MONITOR_ELECTION_NEEDED) {
2918 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
2919 force_election(rec, pnn, nodemap);
2922 if (ret != MONITOR_OK) {
2923 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2927 /* update the list of public ips that a node can handle for
/* reload the nodes file if the daemon's node count disagrees */
2930 if (ctdb->num_nodes != nodemap->num) {
2931 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2932 reload_nodes_file(ctdb);
2935 for (j=0; j<nodemap->num; j++) {
2936 /* release any existing data */
2937 if (ctdb->nodes[j]->public_ips) {
2938 talloc_free(ctdb->nodes[j]->public_ips);
2939 ctdb->nodes[j]->public_ips = NULL;
/* skip inactive nodes — they hold no public ips */
2942 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2946 /* grab a new shiny list of public ips from the node */
2947 if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
2948 ctdb->nodes[j]->pnn,
2950 &ctdb->nodes[j]->public_ips)) {
2951 DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n",
2952 ctdb->nodes[j]->pnn));
2958 /* verify that all active nodes agree that we are the recmaster */
2959 switch (verify_recmaster(rec, nodemap, pnn)) {
2960 case MONITOR_RECOVERY_NEEDED:
2961 /* can not happen */
2963 case MONITOR_ELECTION_NEEDED:
2964 force_election(rec, pnn, nodemap);
2968 case MONITOR_FAILED:
2973 if (rec->need_recovery) {
2974 /* a previous recovery didn't finish */
2975 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2979 /* verify that all active nodes are in normal mode
2980 and not in recovery mode
2982 switch (verify_recmode(ctdb, nodemap)) {
2983 case MONITOR_RECOVERY_NEEDED:
2984 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2986 case MONITOR_FAILED:
2988 case MONITOR_ELECTION_NEEDED:
2989 /* can not happen */
2995 if (ctdb->tunable.verify_recovery_lock != 0) {
2996 /* we should have the reclock - check its not stale */
2997 ret = check_recovery_lock(ctdb);
2999 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3000 ctdb_set_culprit(rec, ctdb->pnn);
3001 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3006 /* get the nodemap for all active remote nodes
3008 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3009 if (remote_nodemaps == NULL) {
3010 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3013 for(i=0; i<nodemap->num; i++) {
3014 remote_nodemaps[i] = NULL;
3016 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3017 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3021 /* verify that all other nodes have the same nodemap as we have
3023 for (j=0; j<nodemap->num; j++) {
3024 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3028 if (remote_nodemaps[j] == NULL) {
3029 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3030 ctdb_set_culprit(rec, j);
3035 /* if the nodes disagree on how many nodes there are
3036 then this is a good reason to try recovery
3038 if (remote_nodemaps[j]->num != nodemap->num) {
3039 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3040 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3041 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3042 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3046 /* if the nodes disagree on which nodes exist and are
3047 active, then that is also a good reason to do recovery
3049 for (i=0;i<nodemap->num;i++) {
3050 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3051 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3052 nodemap->nodes[j].pnn, i,
3053 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3054 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3055 do_recovery(rec, mem_ctx, pnn, nodemap,
3061 /* verify the flags are consistent
3063 for (i=0; i<nodemap->num; i++) {
3064 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3068 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3069 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3070 nodemap->nodes[j].pnn,
3071 nodemap->nodes[i].pnn,
3072 remote_nodemaps[j]->nodes[i].flags,
/* NOTE(review): "our 0x%02x" prints nodes[j].flags but the
 * comparison above is against nodes[i].flags — index i looks
 * intended here; verify against upstream */
3073 nodemap->nodes[j].flags));
/* a node is authoritative for its OWN flags: if j disagrees about
 * itself (i == j), push j's view; otherwise push ours */
3075 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3076 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3077 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3078 do_recovery(rec, mem_ctx, pnn, nodemap,
3082 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3083 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3084 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3085 do_recovery(rec, mem_ctx, pnn, nodemap,
3094 /* there better be the same number of lmasters in the vnn map
3095 as there are active nodes or we will have to do a recovery
3097 if (vnnmap->size != rec->num_active) {
3098 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3099 vnnmap->size, rec->num_active));
3100 ctdb_set_culprit(rec, ctdb->pnn);
3101 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3105 /* verify that all active nodes in the nodemap also exist in
3108 for (j=0; j<nodemap->num; j++) {
3109 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
/* our own entry needs no vnnmap lookup */
3112 if (nodemap->nodes[j].pnn == pnn) {
3116 for (i=0; i<vnnmap->size; i++) {
3117 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3121 if (i == vnnmap->size) {
3122 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3123 nodemap->nodes[j].pnn));
3124 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3125 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3131 /* verify that all other nodes have the same vnnmap
3132 and are from the same generation
3134 for (j=0; j<nodemap->num; j++) {
3135 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3138 if (nodemap->nodes[j].pnn == pnn) {
3142 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3143 mem_ctx, &remote_vnnmap);
3145 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3146 nodemap->nodes[j].pnn));
3150 /* verify the vnnmap generation is the same */
3151 if (vnnmap->generation != remote_vnnmap->generation) {
3152 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3153 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3154 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3155 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3159 /* verify the vnnmap size is the same */
3160 if (vnnmap->size != remote_vnnmap->size) {
3161 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3162 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3163 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3164 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3168 /* verify the vnnmap is the same */
3169 for (i=0;i<vnnmap->size;i++) {
3170 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3171 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3172 nodemap->nodes[j].pnn));
3173 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3174 do_recovery(rec, mem_ctx, pnn, nodemap,
3181 /* we might need to change who has what IP assigned */
3182 if (rec->need_takeover_run) {
3183 rec->need_takeover_run = false;
3185 /* execute the "startrecovery" event script on all nodes */
3186 ret = run_startrecovery_eventscript(rec, nodemap);
3188 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3189 ctdb_set_culprit(rec, ctdb->pnn);
3190 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
/* reassign public ips across the (possibly changed) set of nodes */
3193 ret = ctdb_takeover_run(ctdb, nodemap);
3195 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
3196 ctdb_set_culprit(rec, ctdb->pnn);
3197 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3200 /* execute the "recovered" event script on all nodes */
3201 ret = run_recovered_eventscript(ctdb, nodemap, "monitor_cluster");
3203 // we cant check whether the event completed successfully
3204 // since this script WILL fail if the node is in recovery mode
3205 // and if that race happens, the code here would just cause a second
3206 // cascading recovery.
3208 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3209 ctdb_set_culprit(rec, ctdb->pnn);
3210 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3221 event handler for when the main ctdbd dies
/*
 * fd-event callback on the pipe from the main daemon: the pipe becoming
 * readable (EOF) means ctdbd exited, so the recovery daemon must exit
 * too. Exit path is elided from this listing.
 */
3223 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3224 uint16_t flags, void *private_data)
3226 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3231 called regularly to verify that the recovery daemon is still running
/*
 * Timed event in the MAIN daemon (not the recovery daemon): every 30s,
 * check the recoverd child is alive with kill(pid, 0). If it is gone,
 * perform an orderly shutdown of the whole node: stop subsystems,
 * release public IPs, shut down transport and run the "shutdown"
 * event script. Re-arms itself at the end.
 */
3233 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3234 struct timeval yt, void *p)
3236 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3238 if (kill(ctdb->recoverd_pid, 0) != 0) {
3239 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
3241 ctdb_stop_recoverd(ctdb);
3242 ctdb_stop_keepalive(ctdb);
3243 ctdb_stop_monitoring(ctdb);
3244 ctdb_release_all_ips(ctdb);
3245 if (ctdb->methods != NULL) {
3246 ctdb->methods->shutdown(ctdb);
3248 ctdb_event_script(ctdb, "shutdown");
/* reschedule ourselves for the next 30 second check */
3253 event_add_timed(ctdb->ev, ctdb,
3254 timeval_current_ofs(30, 0),
3255 ctdb_check_recd, ctdb);
/*
 * SIGCHLD handler for the recovery daemon: reap exited children
 * non-blockingly with waitpid(WNOHANG) so helper processes (e.g. the
 * reclock checker) do not linger as zombies. Loop structure is partly
 * elided from this listing.
 */
3258 static void recd_sig_child_handler(struct event_context *ev,
3259 struct signal_event *se, int signum, int count,
3263 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3268 pid = waitpid(-1, &status, WNOHANG);
/* ECHILD just means nothing left to reap — only other errnos are real */
3270 if (errno != ECHILD) {
3271 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3276 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3282 startup the recovery daemon as a child of the main ctdb daemon
/*
 * Fork the recovery daemon. The parent arms the 30s ctdb_check_recd
 * watchdog and returns; the child switches to client mode, watches the
 * pipe for parent death, installs a SIGCHLD handler and then enters
 * monitor_cluster() — which never returns in normal operation.
 * Returns 0 in the parent on success (error returns elided here).
 */
3284 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3287 struct signal_event *se;
/* pipe whose read end tells the child when the parent dies */
3289 if (pipe(fd) != 0) {
3293 ctdb->ctdbd_pid = getpid();
3295 ctdb->recoverd_pid = fork();
3296 if (ctdb->recoverd_pid == -1) {
/* parent: start watchdog and return to the main daemon */
3300 if (ctdb->recoverd_pid != 0) {
3302 event_add_timed(ctdb->ev, ctdb,
3303 timeval_current_ofs(30, 0),
3304 ctdb_check_recd, ctdb);
/* child from here on */
/* reseed PRNG so parent and child do not share a random sequence */
3310 srandom(getpid() ^ time(NULL));
3312 if (switch_from_server_to_client(ctdb) != 0) {
3313 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3317 DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
/* EOF on fd[0] (parent exit) triggers ctdb_recoverd_parent -> exit */
3319 event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
3320 ctdb_recoverd_parent, &fd[0]);
3322 /* set up a handler to pick up sigchld */
3323 se = event_add_signal(ctdb->ev, ctdb,
3325 recd_sig_child_handler,
3328 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3332 monitor_cluster(ctdb);
/* monitor_cluster() loops forever; reaching here is a bug */
3334 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3339 shutdown the recovery daemon
/*
 * Ask the recovery daemon child to terminate with SIGTERM.
 * No-op when no recoverd was ever forked (pid still 0).
 */
3341 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3343 if (ctdb->recoverd_pid == 0) {
3347 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3348 kill(ctdb->recoverd_pid, SIGTERM);