ctdb-recoverd: Improve election win messages
[obnox/samba/samba-obnox.git] / ctdb / server / ctdb_recover.c
index c6d01153ccc149ba2476f731f2ad98957fc5d820..3be4840b462e9b75ac90d33a57cc5dee046a4977 100644 (file)
@@ -40,6 +40,8 @@
 #include "common/common.h"
 #include "common/logging.h"
 
+#include "ctdb_cluster_mutex.h"
+
 int 
 ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
 {
@@ -742,42 +744,23 @@ int32_t ctdb_control_db_push_confirm(struct ctdb_context *ctdb,
        outdata->dsize = sizeof(uint32_t);
 
        talloc_free(state);
+       ctdb_db->push_started = false;
        ctdb_db->push_state = NULL;
 
        return 0;
 }
 
-struct ctdb_cluster_mutex_handle;
-typedef void (*cluster_mutex_handler_t) (
-       struct ctdb_context *ctdb,
-       char status,
-       double latency,
-       struct ctdb_cluster_mutex_handle *h,
-       void *private_data);
-
-struct ctdb_cluster_mutex_handle {
+struct set_recmode_state {
        struct ctdb_context *ctdb;
-       cluster_mutex_handler_t handler;
-       void *private_data;
-       int fd[2];
-       struct tevent_timer *te;
-       struct tevent_fd *fde;
-       pid_t child;
-       struct timeval start_time;
+       struct ctdb_req_control_old *c;
 };
 
-static void set_recmode_handler(struct ctdb_context *ctdb,
-                               char status,
+static void set_recmode_handler(char status,
                                double latency,
-                               struct ctdb_cluster_mutex_handle *h,
                                void *private_data)
 {
-       /* It would be good to use talloc_get_type() here.  However,
-        * the name of the packet is manually set - not sure why.
-        * Could use talloc_check_name() but this seems like a lot of
-        * manual overkill. */
-       struct ctdb_req_control_old *c =
-               (struct ctdb_req_control_old *) private_data;
+       struct set_recmode_state *state = talloc_get_type_abort(
+               private_data, struct set_recmode_state);
        int s = 0;
        const char *err = NULL;
 
@@ -786,7 +769,7 @@ static void set_recmode_handler(struct ctdb_context *ctdb,
                /* Mutex taken */
                DEBUG(DEBUG_ERR,
                      ("ERROR: Daemon able to take recovery lock on \"%s\" during recovery\n",
-                      ctdb->recovery_lock_file));
+                      state->ctdb->recovery_lock));
                s = -1;
                err = "Took recovery lock from daemon during recovery - probably a cluster filesystem lock coherence problem";
                break;
@@ -794,12 +777,12 @@ static void set_recmode_handler(struct ctdb_context *ctdb,
        case '1':
                /* Contention */
                DEBUG(DEBUG_DEBUG, (__location__ " Recovery lock check OK\n"));
-               ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
-               ctdb_process_deferred_attach(ctdb);
+               state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
+               ctdb_process_deferred_attach(state->ctdb);
 
                s = 0;
 
-               CTDB_UPDATE_RECLOCK_LATENCY(ctdb, "daemon reclock",
+               CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock",
                                            reclock.ctdbd, latency);
                break;
 
@@ -812,8 +795,8 @@ static void set_recmode_handler(struct ctdb_context *ctdb,
                DEBUG(DEBUG_WARNING,
                      (__location__
                       "Time out getting recovery lock, allowing recmode set anyway\n"));
-               ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
-               ctdb_process_deferred_attach(ctdb);
+               state->ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
+               ctdb_process_deferred_attach(state->ctdb);
 
                s = 0;
                break;
@@ -825,67 +808,8 @@ static void set_recmode_handler(struct ctdb_context *ctdb,
                err = "Unexpected error when testing recovery lock";
        }
 
-       ctdb_request_control_reply(ctdb, c, NULL, s, err);
-       talloc_free(h);
-}
-
-/*
-  called if our set_recmode child times out. this would happen if
-  ctdb_recovery_lock() would block.
- */
-static void cluster_mutex_timeout(struct tevent_context *ev,
-                                 struct tevent_timer *te,
-                                 struct timeval t, void *private_data)
-{
-       struct ctdb_cluster_mutex_handle *h =
-               talloc_get_type(private_data, struct ctdb_cluster_mutex_handle);
-       double latency = timeval_elapsed(&h->start_time);
-
-       if (h->handler != NULL) {
-               h->handler(h->ctdb, '2', latency, h, h->private_data);
-       }
-}
-
-
-/* When the handle is freed it causes any child holding the mutex to
- * be killed, thus freeing the mutex */
-static int cluster_mutex_destructor(struct ctdb_cluster_mutex_handle *h)
-{
-       if (h->fd[0] != -1) {
-               h->fd[0] = -1;
-       }
-       ctdb_kill(h->ctdb, h->child, SIGKILL);
-       return 0;
-}
-
-/* this is called when the client process has completed ctdb_recovery_lock()
-   and has written data back to us through the pipe.
-*/
-static void cluster_mutex_handler(struct tevent_context *ev,
-                                 struct tevent_fd *fde,
-                                 uint16_t flags, void *private_data)
-{
-       struct ctdb_cluster_mutex_handle *h=
-               talloc_get_type(private_data, struct ctdb_cluster_mutex_handle);
-       double latency = timeval_elapsed(&h->start_time);
-       char c = '0';
-       int ret;
-
-       /* we got a response from our child process so we can abort the
-          timeout.
-       */
-       talloc_free(h->te);
-       h->te = NULL;
-
-       ret = sys_read(h->fd[0], &c, 1);
-
-       /* If the child wrote status then just pass it to the handler.
-        * If no status was written then this is an unexpected error
-        * so pass generic error code to handler. */
-       if (h->handler != NULL) {
-               h->handler(h->ctdb, ret == 1 ? c : '3', latency,
-                          h, h->private_data);
-       }
+       ctdb_request_control_reply(state->ctdb, state->c, NULL, s, err);
+       talloc_free(state);
 }
 
 static void
@@ -901,92 +825,6 @@ ctdb_drop_all_ips_event(struct tevent_context *ev, struct tevent_timer *te,
        ctdb_release_all_ips(ctdb);
 }
 
-static struct ctdb_cluster_mutex_handle *
-ctdb_cluster_mutex(struct ctdb_context *ctdb)
-{
-       struct ctdb_cluster_mutex_handle *h;
-       pid_t parent = getpid();
-       int ret;
-
-       h = talloc(ctdb, struct ctdb_cluster_mutex_handle);
-       if (h == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
-               return NULL;
-       }
-
-       h->start_time = timeval_current();
-       h->fd[0] = -1;
-       h->fd[1] = -1;
-
-       /* For the rest of what needs to be done, we need to do this in
-          a child process since
-          1, the call to ctdb_recovery_lock() can block if the cluster
-             filesystem is in the process of recovery.
-       */
-       ret = pipe(h->fd);
-       if (ret != 0) {
-               talloc_free(h);
-               DEBUG(DEBUG_ERR, (__location__ " Failed to open pipe\n"));
-               return NULL;
-       }
-
-       h->child = ctdb_fork(ctdb);
-       if (h->child == (pid_t)-1) {
-               close(h->fd[0]);
-               close(h->fd[1]);
-               talloc_free(h);
-               return NULL;
-       }
-
-       if (h->child == 0) {
-               char cc = '1';
-               close(h->fd[0]);
-
-               prctl_set_comment("ctdb_cluster_mutex");
-               debug_extra = talloc_asprintf(NULL, "cluster_mutex:");
-               /* Daemon should not be able to get the recover lock,
-                * as it should be held by the recovery master */
-               if (ctdb_recovery_lock(ctdb)) {
-                       DEBUG(DEBUG_ERR,
-                             ("ERROR: Daemon able to take recovery lock on \"%s\" during recovery\n",
-                              ctdb->recovery_lock_file));
-                       ctdb_recovery_unlock(ctdb);
-                       cc = '0';
-               }
-
-               sys_write(h->fd[1], &cc, 1);
-               ctdb_wait_for_process_to_exit(parent);
-               _exit(0);
-       }
-
-       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d\n", h->fd[0]));
-       set_close_on_exec(h->fd[0]);
-
-       close(h->fd[1]);
-       h->fd[1] = -1;
-
-       talloc_set_destructor(h, cluster_mutex_destructor);
-
-       h->te = tevent_add_timer(ctdb->ev, h, timeval_current_ofs(5, 0),
-                                cluster_mutex_timeout, h);
-
-       h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
-                              cluster_mutex_handler, (void *)h);
-
-       if (h->fde == NULL) {
-               talloc_free(h);
-               return NULL;
-       }
-       tevent_fd_set_auto_close(h->fde);
-
-       h->ctdb = ctdb;
-       h->handler = NULL;
-       h->private_data = NULL;
-
-       return h;
-}
-
-
 /*
  * Set up an event to drop all public ips if we remain in recovery for too
  * long
@@ -1016,6 +854,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
        uint32_t recmode = *(uint32_t *)indata.dptr;
        int i;
        struct ctdb_db_context *ctdb_db;
+       struct set_recmode_state *state;
        struct ctdb_cluster_mutex_handle *h;
 
        /* if we enter recovery but stay in recovery for too long
@@ -1063,91 +902,35 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
                }
        }
 
-       /* release any deferred attach calls from clients */
-       if (recmode == CTDB_RECOVERY_NORMAL) {
+       if (ctdb->recovery_lock == NULL) {
+               /* Not using recovery lock file */
+               ctdb->recovery_mode = CTDB_RECOVERY_NORMAL;
                ctdb_process_deferred_attach(ctdb);
+               return 0;
        }
 
-       if (ctdb->recovery_lock_file == NULL) {
-               /* Not using recovery lock file */
-               ctdb->recovery_mode = recmode;
-               return 0;
+       state = talloc_zero(ctdb, struct set_recmode_state);
+       if (state == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               return -1;
        }
+       state->ctdb = ctdb;
+       state->c = NULL;
 
-       h = ctdb_cluster_mutex(ctdb);
+       h = ctdb_cluster_mutex(state, ctdb, ctdb->recovery_lock, 5,
+                              set_recmode_handler, state, NULL, NULL);
        if (h == NULL) {
+               talloc_free(state);
                return -1;
        }
 
-       /* set_recmode_handler() frees h */
-       h->handler = set_recmode_handler;
-       h->private_data = talloc_steal(h, c);
-
+       state->c = talloc_steal(state, c);
        *async_reply = true;
 
        return 0;
 }
 
 
-bool ctdb_recovery_have_lock(struct ctdb_context *ctdb)
-{
-       return ctdb->recovery_lock_fd != -1;
-}
-
-/*
-  try and get the recovery lock in shared storage - should only work
-  on the recovery master recovery daemon. Anywhere else is a bug
- */
-bool ctdb_recovery_lock(struct ctdb_context *ctdb)
-{
-       struct flock lock;
-
-       ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file,
-                                     O_RDWR|O_CREAT, 0600);
-       if (ctdb->recovery_lock_fd == -1) {
-               DEBUG(DEBUG_ERR,
-                     ("ctdb_recovery_lock: Unable to open %s - (%s)\n",
-                      ctdb->recovery_lock_file, strerror(errno)));
-               return false;
-       }
-
-       set_close_on_exec(ctdb->recovery_lock_fd);
-
-       lock.l_type = F_WRLCK;
-       lock.l_whence = SEEK_SET;
-       lock.l_start = 0;
-       lock.l_len = 1;
-       lock.l_pid = 0;
-
-       if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
-               int saved_errno = errno;
-               close(ctdb->recovery_lock_fd);
-               ctdb->recovery_lock_fd = -1;
-               /* Fail silently on these errors, since they indicate
-                * lock contention, but log an error for any other
-                * failure. */
-               if (saved_errno != EACCES &&
-                   saved_errno != EAGAIN) {
-                       DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Failed to get "
-                                        "recovery lock on '%s' - (%s)\n",
-                                        ctdb->recovery_lock_file,
-                                        strerror(saved_errno)));
-               }
-               return false;
-       }
-
-       return true;
-}
-
-void ctdb_recovery_unlock(struct ctdb_context *ctdb)
-{
-       if (ctdb->recovery_lock_fd != -1) {
-               DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
-               close(ctdb->recovery_lock_fd);
-               ctdb->recovery_lock_fd = -1;
-       }
-}
-
 /*
   delete a record as part of the vacuum process
   only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
@@ -1349,40 +1132,123 @@ static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status,
        talloc_free(state);
 }
 
-/*
-  run the startrecovery eventscript
- */
-int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb, 
-                               struct ctdb_req_control_old *c,
-                               bool *async_reply)
+static void run_start_recovery_event(struct ctdb_context *ctdb,
+                                    struct recovery_callback_state *state)
 {
        int ret;
-       struct recovery_callback_state *state;
-
-       DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
-       gettimeofday(&ctdb->last_recovery_started, NULL);
-
-       state = talloc(ctdb, struct recovery_callback_state);
-       CTDB_NO_MEMORY(ctdb, state);
-
-       state->c    = talloc_steal(state, c);
 
        ctdb_disable_monitoring(ctdb);
 
        ret = ctdb_event_script_callback(ctdb, state,
-                                        ctdb_start_recovery_callback, 
+                                        ctdb_start_recovery_callback,
                                         state,
                                         CTDB_EVENT_START_RECOVERY,
                                         "%s", "");
 
        if (ret != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));
+               DEBUG(DEBUG_ERR,("Unable to run startrecovery event\n"));
+               ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
                talloc_free(state);
-               return -1;
+               return;
+       }
+
+       return;
+}
+
+/* Compare two recovery lock settings, either of which may be NULL.
+ * Two NULL settings are equal; NULL never equals a non-NULL string;
+ * two non-NULL settings are equal when strcmp() says so. */
+static bool reclock_strings_equal(const char *a, const char *b)
+{
+       if (a == NULL || b == NULL) {
+               /* At least one is unset: equal only if both are */
+               return a == b;
+       }
+
+       return strcmp(a, b) == 0;
+}
+
+/* Completion callback for the GET_RECLOCK_FILE control that
+ * ctdb_control_start_recovery() sends to the recovery master.
+ *
+ * private_data is the struct recovery_callback_state of the pending
+ * start-recovery request.
+ *
+ * - If the control failed, the pending request is answered with the
+ *   failing status and error message, and state is freed.
+ * - If the recovery master's reclock setting differs from this
+ *   node's, the pending request is failed, state is freed and
+ *   shutdown is initiated.
+ * - Otherwise the "startrecovery" event is run. */
+static void start_recovery_reclock_callback(struct ctdb_context *ctdb,
+                                               int32_t status,
+                                               TDB_DATA data,
+                                               const char *errormsg,
+                                               void *private_data)
+{
+       struct recovery_callback_state *state = talloc_get_type_abort(
+               private_data, struct recovery_callback_state);
+       const char *local = ctdb->recovery_lock;
+       const char *remote = NULL;
+
+       if (status != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n"));
+               ctdb_request_control_reply(ctdb, state->c, NULL,
+                                          status, errormsg);
+               talloc_free(state);
+               return;
+       }
+
+       /* Check reclock consistency.  An empty reply leaves remote as
+        * NULL, i.e. the recovery master has no reclock configured. */
+       if (data.dsize > 0) {
+               /* Ensure NUL-termination */
+               data.dptr[data.dsize-1] = '\0';
+               remote = (const char *)data.dptr;
+       }
+       if (! reclock_strings_equal(local, remote)) {
+               /* Inconsistent */
+               ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+               DEBUG(DEBUG_ERR,
+                     ("Recovery lock configuration inconsistent: "
+                      "recmaster has %s, this node has %s, shutting down\n",
+                      remote == NULL ? "NULL" : remote,
+                      local == NULL ? "NULL" : local));
+               talloc_free(state);
+               ctdb_shutdown_sequence(ctdb, 1);
+               /* state is gone: never fall through to the success
+                * path below, even if the shutdown call returns */
+               return;
+       }
+       DEBUG(DEBUG_INFO,
+             ("Recovery lock consistency check successful\n"));
+
+       run_start_recovery_event(ctdb, state);
+}
+
+/* Check recovery lock consistency and run eventscripts for the
+ * "startrecovery" event.
+ *
+ * The sender of the control (c->hdr.srcnode) is treated as the
+ * recovery master.  If this node does not consider that node
+ * disconnected, it is first asked for its reclock setting and the
+ * event is run from start_recovery_reclock_callback(); otherwise the
+ * event is run straight away.
+ *
+ * Returns 0 with *async_reply set to true when the reply will be
+ * sent later (state then owns c), or -1 if the reclock query could
+ * not be sent (c is left with the caller). */
+int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
+                                   struct ctdb_req_control_old *c,
+                                   bool *async_reply)
+{
+       int ret;
+       struct recovery_callback_state *state;
+       uint32_t recmaster = c->hdr.srcnode;
+
+       DEBUG(DEBUG_NOTICE, ("Running startrecovery event\n"));
+       gettimeofday(&ctdb->last_recovery_started, NULL);
+
+       state = talloc(ctdb, struct recovery_callback_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       state->c = c;
+
+       /* Although the recovery master sent this node a start
+        * recovery control, this node might still think the recovery
+        * master is disconnected.  In this case defer the recovery
+        * lock consistency check. */
+       if (ctdb->nodes[recmaster]->flags & NODE_FLAGS_DISCONNECTED) {
+               /* Hand c over to state and mark the reply as
+                * asynchronous before running the event: on failure
+                * run_start_recovery_event() sends the reply itself
+                * and frees state, so state must not be touched
+                * after this call. */
+               state->c = talloc_steal(state, c);
+               *async_reply = true;
+               run_start_recovery_event(ctdb, state);
+               return 0;
+       } else {
+               /* Ask the recovery master about its reclock setting */
+               ret = ctdb_daemon_send_control(ctdb,
+                                              recmaster,
+                                              0,
+                                              CTDB_CONTROL_GET_RECLOCK_FILE,
+                                              0, 0,
+                                              tdb_null,
+                                              start_recovery_reclock_callback,
+                                              state);
+
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " GET_RECLOCK failed\n"));
+                       talloc_free(state);
+                       return -1;
+               }
        }
 
        /* tell the control that we will be reply asynchronously */
+       state->c = talloc_steal(state, c);
        *async_reply = true;
+
        return 0;
 }
 
@@ -1740,12 +1606,14 @@ int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, T
 
        if (ctdb->pnn != new_recmaster && ctdb->recovery_master == ctdb->pnn) {
                DEBUG(DEBUG_NOTICE,
-                     ("This node (%u) is no longer the recovery master\n", ctdb->pnn));
+                     ("Remote node (%u) is now the recovery master\n",
+                      new_recmaster));
        }
 
        if (ctdb->pnn == new_recmaster && ctdb->recovery_master != new_recmaster) {
                DEBUG(DEBUG_NOTICE,
-                     ("This node (%u) is now the recovery master\n", ctdb->pnn));
+                     ("This node (%u) is now the recovery master\n",
+                      ctdb->pnn));
        }
 
        ctdb->recovery_master = new_recmaster;