this is a demonstration of an idea for handling locks in ctdb.
author Andrew Tridgell <tridge@samba.org>
Mon, 16 Apr 2007 06:01:37 +0000 (16:01 +1000)
committer Andrew Tridgell <tridge@samba.org>
Mon, 16 Apr 2007 06:01:37 +0000 (16:01 +1000)
The problem we have is this:

  - we want the client smbd processes to be able to 'shortcut' access
    to the ltdb, by directly accessing the ltdb, and if the header of
    the record shows we are the dmaster then process immediately, with
    no overhead of talking across the unix domain socket

  - a client doing a shortcut will use tdb_chainlock() to lock the
    record while processing

  - we want the main ctdb daemon to be able to set locks on the
    record, and when those locks collide with a 'shortcut' fcntl lock,
    we want the ctdb daemon to keep processing other operations

  - we don't want to have to send a message from a smbd client to the
    ctdbd each time it releases a lock

The solution is shown in this example. Note that the expensive fork()
and blocking lock are only paid for in case of contention, so in the
median case I think this is zero cost.

Makefile.in
common/util.c
config.mk
include/includes.h
tests/lockwait.c [new file with mode: 0644]

index 175418aa99dcc17310686b58ac9eb69d09a8a822..93e0c7d2bdc7116ae5549f22296d32fe0bb6b623 100644 (file)
@@ -30,7 +30,7 @@ CTDB_OBJ = $(CTDB_COMMON_OBJ) $(CTDB_TCP_OBJ)
 
 OBJS = @TDBOBJ@ @TALLOCOBJ@ @LIBREPLACEOBJ@ @INFINIBAND_WRAPPER_OBJ@ $(EXTRA_OBJ) $(EVENTS_OBJ) $(CTDB_OBJ)
 
-BINS = bin/ctdbd bin/ctdbd_test bin/ctdb_test bin/ctdb_bench bin/ctdb_messaging bin/ctdb_fetch bin/ctdb_fetch1 @INFINIBAND_BINS@
+BINS = bin/ctdbd bin/ctdbd_test bin/ctdb_test bin/ctdb_bench bin/ctdb_messaging bin/ctdb_fetch bin/ctdb_fetch1 bin/lockwait @INFINIBAND_BINS@
 
 DIRS = lib bin
 
@@ -81,6 +81,10 @@ bin/ibwrapper_test: $(OBJS) ib/ibwrapper_test.o
        @echo Linking $@
        @$(CC) $(CFLAGS) -o $@ ib/ibwrapper_test.o $(OBJS) $(LIB_FLAGS)
 
+bin/lockwait: $(OBJS) tests/lockwait.o tests/cmdline.o
+       @echo Linking $@
+       @$(CC) $(CFLAGS) -o $@ tests/lockwait.o tests/cmdline.o $(OBJS) $(LIB_FLAGS)
+
 clean:
        rm -f *.o */*.o */*/*.o
        rm -f $(BINS)
index a44c7d0ad07ac20758e7f03fa71a5cc39dcea088..4d0b25117a7143359dbb93135dc2331f9fcc6f8c 100644 (file)
@@ -37,6 +37,13 @@ struct timeval timeval_current(void)
        return tv;
 }
 
+/*
+  return the time in seconds, as a double, between *tv and the value
+  returned by timeval_current()
+ */
+double timeval_elapsed(struct timeval *tv)
+{
+       struct timeval now = timeval_current();
+       double whole_secs = now.tv_sec - tv->tv_sec;
+       double frac_secs = (now.tv_usec - tv->tv_usec) * 1.0e-6;
+
+       return whole_secs + frac_secs;
+}
+
 /**
   return a timeval struct with the given elements
 */
index 0e0629bfb1f16ada319d9f0c95c58bdb6ec8a186..a16b7b2991949bd9093fd0ae556c27b11cb44074 100644 (file)
--- a/config.mk
+++ b/config.mk
@@ -15,6 +15,7 @@ OBJ_FILES = \
 
 ##################
 [SUBSYSTEM::ctdb]
+INIT_FUNCTION = server_service_ctdbd_init
 OBJ_FILES = \
                ctdb_cluster.o \
                common/ctdb.o \
index 994c25452c8f86fa5e108a7084e464abfd899266..e55ddc2c6b867a23254bbd68a121ffa5542a4f78 100644 (file)
@@ -32,5 +32,5 @@ int timeval_compare(const struct timeval *tv1, const struct timeval *tv2);
 struct timeval timeval_until(const struct timeval *tv1,
                             const struct timeval *tv2);
 _PUBLIC_ struct timeval timeval_current_ofs(uint32_t secs, uint32_t usecs);
+double timeval_elapsed(struct timeval *tv);
 char **file_lines_load(const char *fname, int *numlines, TALLOC_CTX *mem_ctx);
-
diff --git a/tests/lockwait.c b/tests/lockwait.c
new file mode 100644 (file)
index 0000000..2c95bb7
--- /dev/null
@@ -0,0 +1,244 @@
+/* 
+   test a lock wait idea
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include "includes.h"
+#include "lib/events/events.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "popt.h"
+#include "tests/cmdline.h"
+
+
+struct lockwait_handle { /* state for one pending lock wait, allocated by lockwait() */
+       struct fd_event *fde; /* fd event registered via event_add_fd() in lockwait() */
+       int fd[2]; /* filled in by pipe(); the parent closes fd[1] after fork() */
+       pid_t child; /* pid returned by fork(); that process issues the blocking F_SETLKW */
+       void *private_data; /* opaque pointer passed back to callback */
+       void (*callback)(void *); /* called from lockwait_handler() with private_data */
+};
+
+/*
+  fd event handler installed by lockwait()
+
+  Releases the lockwait_handle passed as private_data, runs the user
+  callback with its private pointer, and reaps the child process.
+
+  Everything needed after the free is copied out of the handle first:
+  the handle must not be touched once talloc_free() has run.
+ */
+static void lockwait_handler(struct event_context *ev, struct fd_event *fde, 
+                            uint16_t flags, void *private_data)
+{
+       struct lockwait_handle *h = talloc_get_type(private_data, 
+                                                    struct lockwait_handle);
+       void (*callback)(void *) = h->callback;
+       void *p = h->private_data;
+       pid_t child = h->child;
+       int pipe_fd = h->fd[0];
+
+       /* bypass the destructor so that the child is not sent SIGKILL ... */
+       talloc_set_destructor(h, NULL);
+       talloc_free(h); 
+       /* ... which means the pipe end it would have closed is closed here */
+       close(pipe_fd);
+       callback(p);
+       waitpid(child, NULL, 0);
+}
+
+/*
+  talloc destructor for a lockwait_handle: closes the pipe end held in
+  fd[0], then sends SIGKILL to the child process and reaps it.
+  Always returns 0.
+ */
+static int lockwait_destructor(struct lockwait_handle *h)
+{
+       const int pipe_fd = h->fd[0];
+       const pid_t pid = h->child;
+
+       close(pipe_fd);
+       kill(pid, SIGKILL);
+       waitpid(pid, NULL, 0);
+
+       return 0;
+}
+
+
+/*
+  wait for a byte range of fd to become lockable without blocking the
+  calling process
+
+  A child is forked which requests a write lock on [ofs, ofs+len) with
+  a blocking F_SETLKW and then exits. The parent keeps only the read
+  end of a pipe; the child keeps the write end until it exits, at which
+  point the read end becomes readable (EOF) and lockwait_handler() is
+  run from the event loop, which in turn invokes callback(private_data).
+
+  The child's lock goes away when the child exits, so the callback only
+  tells the caller that it is worth retrying the lock itself.
+
+  ev            event context the notification is delivered on
+  mem_ctx       talloc parent of the returned handle
+  fd, ofs, len  file descriptor and byte range to wait on
+  callback      function called once the child has finished
+  private_data  argument passed to callback
+
+  Returns the handle, or NULL if allocation, pipe(), fork() or
+  event_add_fd() failed. Freeing the handle before the callback has
+  run kills and reaps the child (see lockwait_destructor()).
+ */
+static struct lockwait_handle *lockwait(struct event_context *ev, 
+                                       TALLOC_CTX *mem_ctx,
+                                       int fd, off_t ofs, size_t len,
+                                       void (*callback)(void *), void *private_data)
+{
+       struct lockwait_handle *h;
+       int ret;
+
+       h = talloc_zero(mem_ctx, struct lockwait_handle);
+       if (h == NULL) {
+               return NULL;
+       }
+
+       ret = pipe(h->fd);
+       if (ret != 0) {
+               talloc_free(h);
+               return NULL;
+       }
+
+       h->child = fork();
+       if (h->child == (pid_t)-1) {
+               close(h->fd[0]);
+               close(h->fd[1]);
+               talloc_free(h);
+               return NULL;
+       }
+
+       h->callback = callback;
+       h->private_data = private_data;
+
+       if (h->child == 0) {
+               /* in child */
+               struct flock lock;
+               close(h->fd[0]);
+               lock.l_type = F_WRLCK;
+               lock.l_whence = SEEK_SET;
+               lock.l_start = ofs;
+               lock.l_len = len;
+               lock.l_pid = 0;
+               fcntl(fd,F_SETLKW,&lock);
+               /* exiting closes fd[1], which is what wakes the parent */
+               _exit(0);
+       }
+
+       close(h->fd[1]);
+       talloc_set_destructor(h, lockwait_destructor);
+
+       /* watch the pipe, not the lock file: only the pipe's read end
+          reports that the child has finished */
+       h->fde = event_add_fd(ev, h, h->fd[0], EVENT_FD_READ, lockwait_handler, h);
+       if (h->fde == NULL) {
+               /* the destructor closes fd[0] and kills/reaps the child */
+               talloc_free(h);
+               return NULL;
+       }
+
+       return h;
+}
+
+
+
+
+/*
+  lockwait() callback used by fcntl_lock(): p points at an int flag
+  which is set to 1 to report that the wait has finished
+ */
+static void fcntl_lock_callback(void *p)
+{
+       int *got_lock = (int *)p;
+       *got_lock = 1;
+}
+
+/*
+  get an fcntl lock - waiting if necessary
+
+  op is F_SETLK or F_SETLKW, type is F_RDLCK, F_WRLCK or F_UNLCK, and
+  the lock covers count bytes starting at offset.
+
+  A F_SETLKW request is never issued as a blocking call in this
+  process. It is tried as F_SETLK, and on contention lockwait() is used
+  to wait, running the event loop meanwhile, before trying again.
+
+  Any other op is passed to fcntl() once and its result returned as is.
+
+  Returns 0 on success, or -1 with errno set on failure. errno is
+  ENOLCK if the wait could not be set up.
+ */
+static int fcntl_lock(struct event_context *ev,
+                     int fd, int op, off_t offset, off_t count, int type)
+{
+       struct flock lock;
+       int ret;
+       int use_lockwait = (op == F_SETLKW);
+       int got_lock = 0;
+
+       lock.l_type = type;
+       lock.l_whence = SEEK_SET;
+       lock.l_start = offset;
+       lock.l_len = count;
+       lock.l_pid = 0;
+
+       for (;;) {
+               struct lockwait_handle *h;
+
+               ret = fcntl(fd,use_lockwait?F_SETLK:op,&lock);
+               if (ret == 0) {
+                       return 0;
+               }
+
+               /* only a contended F_SETLKW request is worth waiting for;
+                  anything else is a real failure and retrying it would
+                  spin forever */
+               if (!use_lockwait ||
+                   !(errno == EACCES || errno == EAGAIN || errno == EDEADLK)) {
+                       return ret;
+               }
+
+               h = lockwait(ev, ev, fd, offset, count, 
+                            fcntl_lock_callback, &got_lock);
+               if (h == NULL) {
+                       errno = ENOLCK;
+                       return -1;
+               }
+
+               /* in real code we would return to the event loop */
+               while (!got_lock) {
+                       event_loop_once(ev);
+               }
+
+               /* the waiter has finished: reset the flag and try again */
+               got_lock = 0;
+       }
+}
+
+/* how long each child runs, in seconds; set with -t / --timelimit */
+static int timelimit = 10;
+
+/*
+  body of one test process
+
+  Repeatedly takes and releases a write lock on byte 0 of test.dat
+  through fcntl_lock() until timelimit seconds have passed, then prints
+  the rate achieved. n is only used to label the output.
+
+  Does not return: the process exits when done.
+ */
+static void child(struct event_context *ev, int n)
+{
+       int fd;
+       int count=0;
+       struct timeval tv;
+
+       fd = open("test.dat", O_CREAT|O_RDWR, 0666);
+       if (fd == -1) {
+               perror("test.dat");
+               exit(1);
+       }
+
+       tv = timeval_current();
+
+       while (timeval_elapsed(&tv) < timelimit) {
+               int ret;
+               ret = fcntl_lock(ev, fd, F_SETLKW, 0, 1, F_WRLCK);
+               if (ret != 0) {
+                       printf("Failed to get lock in child %d!\n", n);
+                       break;
+               }
+               ret = fcntl_lock(ev, fd, F_SETLK, 0, 1, F_UNLCK);
+               if (ret != 0) {
+                       printf("Failed to release lock in child %d!\n", n);
+                       break;
+               }
+               count++;
+       }
+
+       printf("child %2d %.0f ops/sec\n", n, count/timeval_elapsed(&tv));
+       /* _exit() does not flush stdio, so without this the result is
+          lost whenever stdout is not line buffered */
+       fflush(stdout);
+       _exit(0);
+}
+
+/*
+  main program
+
+  Parses -t/--timelimit and -n/--num-progs, forks num-progs children
+  which all fight over the same lock (see child()), and waits for them.
+
+  Returns 0 if every child could be started, 1 otherwise. Exits with
+  status 1 on a bad option or if setup fails.
+*/
+int main(int argc, const char *argv[])
+{
+       pid_t *pids;
+       int nprogs = 2;
+       int nstarted = 0;
+       int i;
+       struct event_context *ev;
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               { "timelimit", 't', POPT_ARG_INT, &timelimit, 0, "timelimit", "integer" },
+               { "num-progs", 'n', POPT_ARG_INT, &nprogs, 0, "num_progs", "integer" },
+               POPT_TABLEEND
+       };
+       poptContext pc;
+       int opt;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       poptFreeContext(pc);
+
+       /* a count below 1 would give talloc_array() a bogus size */
+       if (nprogs < 1) {
+               fprintf(stderr, "num-progs must be at least 1\n");
+               exit(1);
+       }
+
+       ev = event_context_init(NULL);
+       if (ev == NULL) {
+               fprintf(stderr, "Failed to create event context\n");
+               exit(1);
+       }
+
+       pids = talloc_array(ev, pid_t, nprogs);
+       if (pids == NULL) {
+               fprintf(stderr, "Out of memory\n");
+               exit(1);
+       }
+
+       /* create N processes fighting over the same lock */
+       for (i=0;i<nprogs;i++) {
+               pids[i] = fork();
+               if (pids[i] == 0) {
+                       /* child() exits the process and never returns */
+                       child(ev, i);
+               }
+               if (pids[i] == (pid_t)-1) {
+                       perror("fork");
+                       continue;
+               }
+               nstarted++;
+       }
+
+       printf("Waiting for %d children ...\n", nstarted);
+
+       /* wait for our kids to finish playing */
+       for (i=0;i<nprogs;i++) {
+               /* skip failed forks: waitpid(-1) would reap any child */
+               if (pids[i] != (pid_t)-1) {
+                       waitpid(pids[i], NULL, 0);
+               }
+       }       
+
+       talloc_free(ev);
+
+       return (nstarted == nprogs) ? 0 : 1;
+}