s3: Avoid a thundering herd in g_lock_unlock
diff --git a/source3/lib/g_lock.c b/source3/lib/g_lock.c
index 0eae3f2131ec551a2da2251cb8de9b21e4d5000c..512c0680d9f00992defc58705dbfde4d4e5948c1 100644
--- a/source3/lib/g_lock.c
+++ b/source3/lib/g_lock.c
@@ -108,6 +108,34 @@ static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
                              (locks[i].lock_type & G_LOCK_PENDING) ?
                              "(pending)" : "(owner)"));
 
+               if (((locks[i].lock_type & G_LOCK_PENDING) == 0)
+                   && !process_exists(locks[i].pid)) {
+
+                       DEBUGADD(10, ("lock owner %s died -- discarding\n",
+                                     procid_str(talloc_tos(),
+                                                &locks[i].pid)));
+
+                       if (i < (num_locks-1)) {
+                               locks[i] = locks[num_locks-1];
+                       }
+                       num_locks -= 1;
+               }
+       }
+
+       *plocks = locks;
+       *pnum_locks = num_locks;
+       return true;
+}
+
+static void g_lock_cleanup(int *pnum_locks, struct g_lock_rec *locks)
+{
+       int i, num_locks;
+
+       num_locks = *pnum_locks;
+
+       DEBUG(10, ("g_lock_cleanup: %d locks\n", num_locks));
+
+       for (i=0; i<num_locks; i++) {
                if (process_exists(locks[i].pid)) {
                        continue;
                }
@@ -119,19 +147,18 @@ static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
                }
                num_locks -= 1;
        }
-
-       *plocks = locks;
        *pnum_locks = num_locks;
-       return true;
+       return;
 }
 
 static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
                                        struct g_lock_rec *locks,
-                                       int num_locks,
+                                       int *pnum_locks,
                                        const struct server_id pid,
                                        enum g_lock_type lock_type)
 {
        struct g_lock_rec *result;
+       int num_locks = *pnum_locks;
 
        result = talloc_realloc(mem_ctx, locks, struct g_lock_rec,
                                num_locks+1);
@@ -141,6 +168,7 @@ static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
 
        result[num_locks].pid = pid;
        result[num_locks].lock_type = lock_type;
+       *pnum_locks += 1;
        return result;
 }
 
@@ -149,10 +177,6 @@ static void g_lock_got_retry(struct messaging_context *msg,
                             uint32_t msg_type,
                             struct server_id server_id,
                             DATA_BLOB *data);
-static void g_lock_timedout(struct tevent_context *ev,
-                           struct tevent_timer *te,
-                           struct timeval current_time,
-                           void *private_data);
 
 static NTSTATUS g_lock_trylock(struct g_lock_ctx *ctx, const char *name,
                               enum g_lock_type lock_type)
@@ -225,7 +249,7 @@ again:
        if (our_index == -1) {
                /* First round, add ourself */
 
-               locks = g_lock_addrec(talloc_tos(), locks, num_locks,
+               locks = g_lock_addrec(talloc_tos(), locks, &num_locks,
                                      self, lock_type);
                if (locks == NULL) {
                        DEBUG(10, ("g_lock_addrec failed\n"));
@@ -241,7 +265,14 @@ again:
                locks[our_index].lock_type = lock_type;
        }
 
-       data = make_tdb_data((uint8_t *)locks, talloc_get_size(locks));
+       if (NT_STATUS_IS_OK(status) && ((lock_type & G_LOCK_PENDING) == 0)) {
+               /*
+                * Walk through the list of locks, search for dead entries
+                */
+               g_lock_cleanup(&num_locks, locks);
+       }
+
+       data = make_tdb_data((uint8_t *)locks, num_locks * sizeof(*locks));
        store_status = rec->store(rec, data, 0);
        if (!NT_STATUS_IS_OK(store_status)) {
                DEBUG(1, ("rec->store failed: %s\n",
@@ -266,7 +297,9 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
        struct tevent_timer *te = NULL;
        NTSTATUS status;
        bool retry = false;
-       bool timedout = false;
+       struct timeval timeout_end;
+       struct timeval timeout_remaining;
+       struct timeval time_now;
 
        DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type,
                   name));
@@ -295,53 +328,118 @@ NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
                           nt_errstr(status)));
                return status;
        }
-again:
-       retry = false;
 
-       status = g_lock_trylock(ctx, name, lock_type);
-       if (NT_STATUS_IS_OK(status)) {
-               DEBUG(10, ("Got lock %s\n", name));
-               goto done;
-       }
-       if (!NT_STATUS_EQUAL(status, STATUS_PENDING)) {
-               DEBUG(10, ("g_lock_trylock failed: %s\n",
-                          nt_errstr(status)));
-               goto done;
-       }
+       time_now = timeval_current();
+       timeout_end = timeval_sum(&time_now, &timeout);
 
-       DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
+       while (true) {
+#ifdef CLUSTER_SUPPORT
+               fd_set _r_fds;
+#endif
+               fd_set *r_fds = NULL;
+               int max_fd = 0;
+               int ret;
 
-       if (te == NULL) {
-               te = tevent_add_timer(
-                       ctx->msg->event_ctx, talloc_tos(),
-                       timeval_current_ofs(timeout.tv_sec, timeout.tv_usec),
-                       g_lock_timedout, &timedout);
-               if (te == NULL) {
-                       DEBUG(10, ("tevent_add_timer failed\n"));
-                       status = NT_STATUS_NO_MEMORY;
-                       goto done;
+               status = g_lock_trylock(ctx, name, lock_type);
+               if (NT_STATUS_IS_OK(status)) {
+                       DEBUG(10, ("Got lock %s\n", name));
+                       break;
                }
-       }
-
-       while (true) {
-               if (tevent_loop_once(ctx->msg->event_ctx) == -1) {
-                       DEBUG(1, ("tevent_loop_once failed\n"));
-                       status = NT_STATUS_INTERNAL_ERROR;
-                       goto done;
+               if (!NT_STATUS_EQUAL(status, STATUS_PENDING)) {
+                       DEBUG(10, ("g_lock_trylock failed: %s\n",
+                                  nt_errstr(status)));
+                       break;
                }
-               if (retry) {
-                       goto again;
+
+               DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
+
+               /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                *             !!! HACK ALERT --- FIX ME !!!
+                * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                * What we really want to do here is to react to
+                * MSG_DBWRAP_G_LOCK_RETRY messages that are either sent
+                * by a client doing g_lock_unlock or by ourselves when
+                * we receive a CTDB_SRVID_SAMBA_NOTIFY or
+                * CTDB_SRVID_RECONFIGURE message from ctdbd, i.e. when
+                * either a client holding a lock or a complete node
+                * has died.
+                *
+                * Doing this properly involves calling tevent_loop_once(),
+                * but doing this here with the main ctdbd messaging context
+                * creates a nested event loop when g_lock_lock() is called
+                * from the main event loop, e.g. in a tcon_and_X where the
+                * share_info.tdb needs to be initialized and is locked by
+                * another process, or when the remote registry is accessed
+                * for writing and some other process already holds a lock
+                * on the registry.tdb.
+                *
+                * So as a quick fix, we act a little coarsely here: we do
+                * a select on the ctdb connection fd and when it is readable
+                * or we get EINTR, then we retry without actually parsing
+                * any ctdb packets or dispatching messages. This means that
+                * we retry more often than intended by design, but this does
+                * no harm and it is unobtrusive. When we have finished,
+                * the main loop will pick up all the messages and ctdb
+                * packets. The only extra twist is that we cannot use timed
+                * events here but have to handcode a timeout.
+                */
+
+#ifdef CLUSTER_SUPPORT
+               if (lp_clustering()) {
+                       struct ctdbd_connection *conn = messaging_ctdbd_connection();
+
+                       r_fds = &_r_fds;
+                       FD_ZERO(r_fds);
+                       max_fd = ctdbd_conn_get_fd(conn);
+                       FD_SET(max_fd, r_fds);
                }
-               if (timedout) {
-                       DEBUG(10, ("g_lock_lock timed out\n"));
+#endif
 
-                       te = NULL;
+               time_now = timeval_current();
+               timeout_remaining = timeval_until(&time_now, &timeout_end);
 
-                       status = NT_STATUS_LOCK_NOT_GRANTED;
-                       goto done;
+               ret = sys_select(max_fd + 1, r_fds, NULL, NULL,
+                                &timeout_remaining);
+
+               if (ret == -1) {
+                       if (errno != EINTR) {
+                               DEBUG(1, ("error calling select: %s\n",
+                                         strerror(errno)));
+                               status = NT_STATUS_INTERNAL_ERROR;
+                               break;
+                       }
+                       /*
+                        * errno == EINTR:
+                        * This means a signal was received.
+                        * It might have been a MSG_DBWRAP_G_LOCK_RETRY message.
+                        * ==> retry
+                        */
+               } else if (ret == 0) {
+                       if (timeval_expired(&timeout_end)) {
+                               DEBUG(10, ("g_lock_lock timed out\n"));
+                               status = NT_STATUS_LOCK_NOT_GRANTED;
+                               break;
+                       } else {
+                               DEBUG(10, ("select returned 0 but timeout "
+                                          "not expired: strange - retrying\n"));
+                       }
+               } else if (ret != 1) {
+                       DEBUG(1, ("invalid return code of select: %d\n", ret));
+                       status = NT_STATUS_INTERNAL_ERROR;
+                       break;
                }
+               /*
+                * ret == 1:
+                * This means ctdbd has sent us some data.
+                * Might be a CTDB_SRVID_RECONFIGURE or a
+                * CTDB_SRVID_SAMBA_NOTIFY message.
+                * ==> retry
+                */
        }
+
+#ifdef CLUSTER_SUPPORT
 done:
+#endif
 
        if (!NT_STATUS_IS_OK(status)) {
                NTSTATUS unlock_status;
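
The loop above deliberately avoids a nested tevent loop and instead hand-codes its timeout around select(). The following standalone sketch is not Samba code (try_lock(), wakeup_fd and the five-second timeout are made-up stand-ins for g_lock_trylock(), the ctdbd connection fd and the caller's timeout), but it shows the same pattern in plain POSIX: compute an absolute deadline once, select() with the remaining time, and treat EINTR, fd readability and an unexpired timeout slice alike as a cue to retry.

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/select.h>
	#include <sys/time.h>

	/* Stand-in for g_lock_trylock(): pretend the lock becomes free on
	 * the second attempt. */
	static bool try_lock(void)
	{
		static int attempts;
		attempts += 1;
		return (attempts >= 2);
	}

	/* Remaining time until an absolute deadline, clamped at zero. */
	static struct timeval time_left(const struct timeval *deadline)
	{
		struct timeval now, left = {0, 0};

		gettimeofday(&now, NULL);
		if (timercmp(&now, deadline, <)) {
			timersub(deadline, &now, &left);
		}
		return left;
	}

	int main(void)
	{
		struct timeval now, timeout = {5, 0}, deadline;
		int wakeup_fd = 0;	/* stand-in for the ctdbd connection fd */

		gettimeofday(&now, NULL);
		timeradd(&now, &timeout, &deadline);	/* absolute end time */

		while (!try_lock()) {
			fd_set r_fds;
			struct timeval left = time_left(&deadline);
			int ret;

			if (left.tv_sec == 0 && left.tv_usec == 0) {
				fprintf(stderr, "lock attempt timed out\n");
				return 1;
			}

			FD_ZERO(&r_fds);
			FD_SET(wakeup_fd, &r_fds);

			/* Sleep until the fd is readable, a signal arrives
			 * (EINTR) or the remaining time runs out ... */
			ret = select(wakeup_fd + 1, &r_fds, NULL, NULL, &left);
			if (ret == -1 && errno != EINTR) {
				fprintf(stderr, "select: %s\n", strerror(errno));
				return 1;
			}
			/* ... and in every other case simply loop and retry. */
		}

		printf("got the lock\n");
		return 0;
	}
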
@@ -374,16 +472,6 @@ static void g_lock_got_retry(struct messaging_context *msg,
        *pretry = true;
 }
 
-static void g_lock_timedout(struct tevent_context *ev,
-                           struct tevent_timer *te,
-                           struct timeval current_time,
-                           void *private_data)
-{
-       bool *ptimedout = (bool *)private_data;
-       *ptimedout = true;
-       TALLOC_FREE(te);
-}
-
 static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
                                    struct server_id pid)
 {
@@ -442,13 +530,23 @@ static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
        }
 
        if ((lock_type & G_LOCK_PENDING) == 0) {
+               int num_wakeups = 0;
+
                /*
-                * We've been the lock holder. Tell all others to retry.
+                * We've been the lock holder. Tell some waiters to retry,
+                * but not all of them, to avoid a thundering herd. In
+                * case this leads to a complete stall because we miss
+                * some processes, the loop in g_lock_lock tries at least
+                * once a minute.
                 */
+
                for (i=0; i<num_locks; i++) {
                        if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
                                continue;
                        }
+                       if (!process_exists(locks[i].pid)) {
+                               continue;
+                       }
 
                        /*
                         * Ping all waiters to retry
@@ -461,6 +559,11 @@ static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
                                          procid_str(talloc_tos(),
                                                     &locks[i].pid),
                                          nt_errstr(status)));
+                       } else {
+                               num_wakeups += 1;
+                       }
+                       if (num_wakeups > 5) {
+                               break;
                        }
                }
        }
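
For illustration, here is a small standalone sketch of the wake-up strategy the unlock path now follows. It is not Samba code: the lock_rec layout, kill(pid, 0) and printf() are stand-ins for g_lock's record format, process_exists() and the MSG_DBWRAP_G_LOCK_RETRY message, and WAKEUP_CAP mirrors the num_wakeups > 5 cutoff above. The idea is simply to skip dead waiters, ping live pending ones, and stop after a handful of successful wakeups, relying on the retry loop in g_lock_lock to catch anyone who was not pinged.

	#include <signal.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	#define WAKEUP_CAP 5	/* mirrors the "num_wakeups > 5" cutoff */

	struct lock_rec {
		pid_t pid;
		bool pending;	/* waiter (true) vs. current holder (false) */
	};

	/* Stand-in for process_exists(): signal 0 only checks existence. */
	static bool proc_alive(pid_t pid)
	{
		return kill(pid, 0) == 0;
	}

	/* Stand-in for sending MSG_DBWRAP_G_LOCK_RETRY to a waiter. */
	static bool ping_waiter(pid_t pid)
	{
		printf("telling pid %ld to retry\n", (long)pid);
		return true;
	}

	static void wake_some_waiters(const struct lock_rec *locks,
				      int num_locks)
	{
		int i, num_wakeups = 0;

		for (i = 0; i < num_locks; i++) {
			if (!locks[i].pending) {
				continue;	/* a holder, not a waiter */
			}
			if (!proc_alive(locks[i].pid)) {
				continue;	/* dead waiter, nothing to wake */
			}
			if (ping_waiter(locks[i].pid)) {
				num_wakeups += 1;
			}
			if (num_wakeups > WAKEUP_CAP) {
				break;		/* enough retries in flight */
			}
		}
	}

	int main(void)
	{
		struct lock_rec locks[] = {
			{ getpid(), false },	/* the releasing holder */
			{ getpid(), true },	/* a live "waiter" for demo */
		};

		wake_some_waiters(locks, 2);
		return 0;
	}
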