ctdb-recovery: Reimplement ctdb_recovery_lock() using ctdb_cluster_mutex()
authorMartin Schwenke <martin@meltin.net>
Tue, 19 Jan 2016 09:33:58 +0000 (20:33 +1100)
committerAmitay Isaacs <amitay@samba.org>
Thu, 28 Apr 2016 07:39:16 +0000 (09:39 +0200)
Replace the file descriptor for the recovery lock in the CTDB context
with the cluster mutex handle, where non-NULL means locked.
Attempting to take the recovery lock is now asynchronous, but
ctdb_recovery_lock() still runs the event loop until the attempt has
completed, so its caller still waits for the result.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/include/ctdb_private.h
ctdb/server/ctdb_recover.c
ctdb/server/ctdbd.c

index 41b9f4f6ce92838302e8bc893639e317b5e48df4..edd451b2c1f106882f4b846664ad7f4facc9f8e4 100644 (file)
@@ -280,6 +280,8 @@ struct ctdb_daemon_data {
        }
 
 
+struct ctdb_cluster_mutex_handle;
+
 enum ctdb_freeze_mode {CTDB_FREEZE_NONE, CTDB_FREEZE_PENDING, CTDB_FREEZE_FROZEN};
 
 #define NUM_DB_PRIORITIES 3
@@ -309,7 +311,7 @@ struct ctdb_context {
        uint64_t max_persistent_check_errors;
        const char *transport;
        char *recovery_lock_file;
-       int recovery_lock_fd;
+       struct ctdb_cluster_mutex_handle *recovery_lock_handle;
        uint32_t pnn; /* our own pnn */
        uint32_t num_nodes;
        uint32_t num_connected;
index 8314388930c329895a10fe5e5532efb863c568b1..f58f9a6da2ad176286af2a574818b612bce49042 100644 (file)
@@ -1137,60 +1137,73 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
 
 bool ctdb_recovery_have_lock(struct ctdb_context *ctdb)
 {
-       return ctdb->recovery_lock_fd != -1;
+       return (ctdb->recovery_lock_handle != NULL);
 }
 
-/*
-  try and get the recovery lock in shared storage - should only work
-  on the recovery master recovery daemon. Anywhere else is a bug
- */
-bool ctdb_recovery_lock(struct ctdb_context *ctdb)
+struct hold_reclock_state {
+       bool done;
+       char status;
+};
+
+static void hold_reclock_handler(struct ctdb_context *ctdb,
+                                char status,
+                                double latency,
+                                struct ctdb_cluster_mutex_handle *h,
+                                void *private_data)
 {
-       struct flock lock;
+       struct hold_reclock_state *s =
+               (struct hold_reclock_state *) private_data;
+
+       switch (status) {
+       case '0':
+               ctdb->recovery_lock_handle = h;
+               break;
 
-       ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file,
-                                     O_RDWR|O_CREAT, 0600);
-       if (ctdb->recovery_lock_fd == -1) {
+       case '1':
                DEBUG(DEBUG_ERR,
-                     ("ctdb_recovery_lock: Unable to open %s - (%s)\n",
-                      ctdb->recovery_lock_file, strerror(errno)));
-               return false;
+                     ("Unable to take recovery lock - contention\n"));
+               talloc_free(h);
+               break;
+
+       default:
+               DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
+               talloc_free(h);
        }
 
-       set_close_on_exec(ctdb->recovery_lock_fd);
-
-       lock.l_type = F_WRLCK;
-       lock.l_whence = SEEK_SET;
-       lock.l_start = 0;
-       lock.l_len = 1;
-       lock.l_pid = 0;
-
-       if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
-               int saved_errno = errno;
-               close(ctdb->recovery_lock_fd);
-               ctdb->recovery_lock_fd = -1;
-               /* Fail silently on these errors, since they indicate
-                * lock contention, but log an error for any other
-                * failure. */
-               if (saved_errno != EACCES &&
-                   saved_errno != EAGAIN) {
-                       DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Failed to get "
-                                        "recovery lock on '%s' - (%s)\n",
-                                        ctdb->recovery_lock_file,
-                                        strerror(saved_errno)));
-               }
-               return false;
+       s->done = true;
+       s->status = status;
+}
+
+/* Take the recovery lock, waiting for the result; true only if now held */
+bool ctdb_recovery_lock(struct ctdb_context *ctdb)
+{
+       struct ctdb_cluster_mutex_handle *h;
+       struct hold_reclock_state s = { .done = false, .status = '0' };
+
+       h = ctdb_cluster_mutex(ctdb, 0);
+       if (h == NULL) {
+               return false;
        }
 
-       return true;
+       h->handler = hold_reclock_handler;
+       h->private_data = &s;
+
+       while (!s.done) {
+               tevent_loop_once(ctdb->ev);
+       }
+       /* h was freed by the handler on failure; else detach stack state s */
+       if (s.status != '0') {
+               return false;
+       }
+       h->private_data = NULL;
+       return true;
 }
 
 void ctdb_recovery_unlock(struct ctdb_context *ctdb)
 {
-       if (ctdb->recovery_lock_fd != -1) {
+       if (ctdb->recovery_lock_handle != NULL) {
                DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
-               close(ctdb->recovery_lock_fd);
-               ctdb->recovery_lock_fd = -1;
+               TALLOC_FREE(ctdb->recovery_lock_handle);
        }
 }
 
index 5fc1db6318d99d9df1a6d99c7616c3b5d93e2528..5d6b4be36cdf6959cd740434d77662b2f168e2eb 100644 (file)
@@ -195,7 +195,7 @@ int main(int argc, const char *argv[])
        ctdb->recovery_mode    = CTDB_RECOVERY_NORMAL;
        ctdb->recovery_master  = (uint32_t)-1;
        ctdb->upcalls          = &ctdb_upcalls;
-       ctdb->recovery_lock_fd = -1;
+       ctdb->recovery_lock_handle = NULL;
 
        TALLOC_FREE(ctdb->idr);
        ret = reqid_init(ctdb, 0, &ctdb->idr);;