do persistent writes in a child process
author Ronnie Sahlberg <ronniesahlberg@gmail.com>
Wed, 28 May 2008 03:04:25 +0000 (13:04 +1000)
committer Ronnie Sahlberg <ronniesahlberg@gmail.com>
Wed, 28 May 2008 03:04:25 +0000 (13:04 +1000)
(This used to be ctdb commit 2da3d1f876f5d654f849af8a3e588f5a61300c3d)

ctdb/common/ctdb_ltdb.c
ctdb/include/ctdb_private.h
ctdb/server/ctdb_persistent.c
ctdb/tools/ctdb.c

index 5272927ffca2cf4575379087bb1241743cba0069..a6bf26817aae9a6142d3076acd88010ad453c2a0 100644 (file)
@@ -160,7 +160,7 @@ int ctdb_ltdb_store(struct ctdb_db_context *ctdb_db, TDB_DATA key,
 
 /*
   write a record to a persistent database
-  at this stage the the record is locked by a lockwait child.
+  this is done by a child process
 */
 int ctdb_ltdb_persistent_store(struct ctdb_db_context *ctdb_db, TDB_DATA key, 
                    struct ctdb_ltdb_header *header, TDB_DATA data)
@@ -189,7 +189,7 @@ int ctdb_ltdb_persistent_store(struct ctdb_db_context *ctdb_db, TDB_DATA key,
 
        /* if this is a persistent database without NOSYNC then we
           will do this via a transaction */
-       if (0 && !(ctdb_db->client_tdb_flags & TDB_NOSYNC)) {
+       if (!(ctdb_db->client_tdb_flags & TDB_NOSYNC)) {
                ret = tdb_transaction_start(ctdb_db->ltdb->tdb);
                if (ret != 0) {
                        DEBUG(DEBUG_ERR, (__location__ " Failed to start local transaction\n"));
index 758b506c65cf3fe28bd0eeb8d18e4a0490967b42..2d595ff3a2f4c7d61aee4e8c30bee8a53b80d9f3 100644 (file)
@@ -306,11 +306,14 @@ struct ctdb_statistics {
        uint32_t pending_calls;
        uint32_t lockwait_calls;
        uint32_t pending_lockwait_calls;
+       uint32_t childwrite_calls;
+       uint32_t pending_childwrite_calls;
        uint32_t memory_used;
        uint32_t __last_counter; /* hack for control_statistics_all */
        uint32_t max_hop_count;
        double max_call_latency;
        double max_lockwait_latency;
+       double max_childwrite_latency;
 };
 
 
index 6a7a3eb466263214516d0ee5a43bf630a14c069c..c68ef67980b4817495c14a8b161a389492f5ca37 100644 (file)
@@ -136,7 +136,7 @@ int32_t ctdb_control_persistent_store(struct ctdb_context *ctdb,
 }
 
 
-struct ctdb_persistent_lock_state {
+struct ctdb_persistent_write_state {
        struct ctdb_db_context *ctdb_db;
        TDB_DATA key;
        TDB_DATA data;
@@ -147,9 +147,9 @@ struct ctdb_persistent_lock_state {
 
 
 /*
-  called with a lock held by a lockwait child
+  called from a child process to write the data
  */
-static int ctdb_persistent_store(struct ctdb_persistent_lock_state *state)
+static int ctdb_persistent_store(struct ctdb_persistent_write_state *state)
 {
        struct ctdb_ltdb_header oldheader;
        int ret;
@@ -181,25 +181,16 @@ static int ctdb_persistent_store(struct ctdb_persistent_lock_state *state)
 
 
 /*
-  called when we get the lock on the given record
-  at this point the lockwait child holds a lock on our behalf
+  called when the child has completed the persistent write
+  on our behalf
  */
-static void ctdb_persistent_lock_callback(void *private_data)
+static void ctdb_persistent_write_callback(int status, void *private_data)
 {
-       struct ctdb_persistent_lock_state *state = talloc_get_type(private_data, 
-                                                                  struct ctdb_persistent_lock_state);
-       int ret;
+       struct ctdb_persistent_write_state *state = talloc_get_type(private_data, 
+                                                                  struct ctdb_persistent_write_state);
 
-       ret = tdb_chainlock_mark(state->tdb, state->key);
-       if (ret != 0) {
-               DEBUG(DEBUG_ERR,("Failed to mark lock in ctdb_persistent_lock_callback\n"));
-               ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, ret, NULL);
-               return;
-       }
 
-       ret = ctdb_persistent_store(state);
-       ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, ret, NULL);
-       tdb_chainlock_unmark(state->tdb, state->key);
+       ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, status, NULL);
 
        talloc_free(state);
 }
@@ -210,12 +201,155 @@ static void ctdb_persistent_lock_callback(void *private_data)
 static void ctdb_persistent_lock_timeout(struct event_context *ev, struct timed_event *te, 
                                         struct timeval t, void *private_data)
 {
-       struct ctdb_persistent_lock_state *state = talloc_get_type(private_data, 
-                                                                  struct ctdb_persistent_lock_state);
+       struct ctdb_persistent_write_state *state = talloc_get_type(private_data, 
+                                                                  struct ctdb_persistent_write_state);
        ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, -1, "timeout in ctdb_persistent_lock");
        talloc_free(state);
 }
 
+struct childwrite_handle {             /* parent-side state for one forked writer child */
+       struct ctdb_context *ctdb;      /* used to reach the statistics counters */
+       struct ctdb_db_context *ctdb_db;
+       struct fd_event *fde;           /* read event on fd[0], from event_add_fd() */
+       int fd[2];                      /* pipe(): child writes fd[1], parent reads fd[0] */
+       pid_t child;                    /* fork() result; SIGKILLed and reaped on teardown */
+       void *private_data;             /* handed back as the callback's second argument */
+       void (*callback)(int, void *);  /* gets the status byte read from the pipe */
+       TDB_DATA key;                   /* copied from the ctdb_childwrite() arguments; */
+       TDB_DATA data;                  /* not read again in the code visible here */
+       struct timeval start_time;      /* set once setup succeeds; used for latency stats */
+};
+
+static int childwrite_destructor(struct childwrite_handle *h)
+{      /* talloc destructor installed by ctdb_childwrite(): stop and reap the child */
+       h->ctdb->statistics.pending_childwrite_calls--;  /* this write is no longer pending */
+       kill(h->child, SIGKILL);        /* the child only exits by itself once the parent is gone */
+       waitpid(h->child, NULL, 0);     /* collect the exit status of the killed child */
+       return 0;
+}
+
+/* fd event handler, run in the parent when the pipe from the writer child
+   becomes readable: reads the one-byte status, hands it to the stored
+   callback, then kills and reaps the child and frees the handle */
+static void childwrite_handler(struct event_context *ev, struct fd_event *fde, 
+                            uint16_t flags, void *private_data)
+{
+       struct childwrite_handle *h = talloc_get_type(private_data, 
+                                                    struct childwrite_handle);
+       void *p = h->private_data;
+       void (*callback)(int, void *) = h->callback;
+       pid_t child = h->child;
+       TALLOC_CTX *tmp_ctx = talloc_new(ev);
+       int ret;
+       char c;
+
+       ctdb_latency(&h->ctdb->statistics.max_childwrite_latency, h->start_time);
+       h->ctdb->statistics.pending_childwrite_calls--;
+
+       /* reparent the handle onto tmp_ctx so that it stays valid even if
+          the callback frees its original talloc parent (private_data);
+          it is released by talloc_free(tmp_ctx) at the end */
+       talloc_steal(tmp_ctx, h);
+       /* the destructor's work (counter, kill, waitpid) is done by hand in here */
+       talloc_set_destructor(h, NULL);
+
+       ret = read(h->fd[0], &c, 1);    /* status byte written by the child */
+       if (ret < 1) {                  /* read error or EOF: report failure */
+               DEBUG(DEBUG_ERR, (__location__ " Read returned %d. Childwrite failed\n", ret));
+               c = 1;
+       }
+
+       /* XXX we need to pass state back here.? */
+       callback(c, p);
+
+       kill(child, SIGKILL);
+       waitpid(child, NULL, 0);
+       talloc_free(tmp_ctx);           /* also frees h, which was stolen onto tmp_ctx */
+}
+
+/* fork a child that writes the record through ctdb_persistent_store() and
+   reports one status byte (0 = ok, 1 = failed) over a pipe; the parent then
+   runs callback(status, private_data).  Returns the handle, NULL on error. */
+struct childwrite_handle *ctdb_childwrite(struct ctdb_db_context *ctdb_db,
+                               TDB_DATA key,
+                               TDB_DATA data,
+                               void (*callback)(int, void *private_data),
+                               void *private_data)
+{
+       struct childwrite_handle *result;
+       int ret;
+       pid_t parent = getpid();
+
+       ctdb_db->ctdb->statistics.childwrite_calls++;
+       ctdb_db->ctdb->statistics.pending_childwrite_calls++;
+       /* the handle is a talloc child of private_data: freeing that frees it too */
+       if (!(result = talloc_zero(private_data, struct childwrite_handle))) {
+               ctdb_db->ctdb->statistics.pending_childwrite_calls--;
+               return NULL;
+       }
+
+       ret = pipe(result->fd);
+       /* no destructor is installed yet, so failures here undo the counter by hand */
+       if (ret != 0) {
+               talloc_free(result);
+               ctdb_db->ctdb->statistics.pending_childwrite_calls--;
+               return NULL;
+       }
+
+       result->child = fork();
+
+       if (result->child == (pid_t)-1) {
+               close(result->fd[0]);
+               close(result->fd[1]);
+               talloc_free(result);
+               ctdb_db->ctdb->statistics.pending_childwrite_calls--;
+               return NULL;
+       }
+
+       result->callback = callback;
+       result->private_data = private_data;
+       result->ctdb = ctdb_db->ctdb;
+       result->ctdb_db = ctdb_db;
+       result->key = key;
+       result->data = data;
+       /* child: do the write, report the result, then idle until killed */
+       if (result->child == 0) {
+               char c = 0;
+               struct ctdb_persistent_write_state *state = talloc_get_type(private_data, struct ctdb_persistent_write_state);
+               /* the write uses private_data (state), not the key/data arguments */
+               close(result->fd[0]);
+               ret = ctdb_persistent_store(state);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to write persistent data\n"));
+                       c = 1;
+               }
+
+               write(result->fd[1], &c, 1);
+
+               /* make sure we die when our parent dies */
+               while (kill(parent, 0) == 0 || errno != ESRCH) {
+                       sleep(5);
+               }
+               _exit(0);
+       }
+
+       close(result->fd[1]);
+       talloc_set_destructor(result, childwrite_destructor);
+
+       result->fde = event_add_fd(ctdb_db->ctdb->ev, result, result->fd[0],
+                                  EVENT_FD_READ|EVENT_FD_AUTOCLOSE, childwrite_handler,
+                                  (void *)result);
+       if (result->fde == NULL) {
+               close(result->fd[0]);   /* no fde exists to auto-close the read end */
+               talloc_free(result);    /* destructor drops the counter and reaps the child */
+               return NULL;
+       }
+
+       result->start_time = timeval_current();
+
+       return result;
+}
+
 /* 
    update a record on this node if the new record has a higher rsn than the
    current record
@@ -227,8 +361,8 @@ int32_t ctdb_control_update_record(struct ctdb_context *ctdb,
        struct ctdb_rec_data *rec = (struct ctdb_rec_data *)&recdata.dptr[0];
        struct ctdb_db_context *ctdb_db;
        uint32_t db_id = rec->reqid;
-       struct ctdb_persistent_lock_state *state;
-       struct lockwait_handle *handle;
+       struct ctdb_persistent_write_state *state;
+       struct childwrite_handle *handle;
 
        if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
                DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_update_record when recovery active\n"));
@@ -241,7 +375,7 @@ int32_t ctdb_control_update_record(struct ctdb_context *ctdb,
                return -1;
        }
 
-       state = talloc(ctdb, struct ctdb_persistent_lock_state);
+       state = talloc(ctdb, struct ctdb_persistent_write_state);
        CTDB_NO_MEMORY(ctdb, state);
 
        state->ctdb_db = ctdb_db;
@@ -263,26 +397,13 @@ int32_t ctdb_control_update_record(struct ctdb_context *ctdb,
        state->data.dptr  += sizeof(struct ctdb_ltdb_header);
        state->data.dsize -= sizeof(struct ctdb_ltdb_header);
 
-#if 0
-       /* We can not take out a lock here ourself since if this persistent
-          database needs safe transaction writes we can not be holding
-          a lock on the database.
-          Therefore we always create a lock wait child to take out and hold
-          the lock for us.
-       */
-       ret = tdb_chainlock_nonblock(state->tdb, state->key);
-       if (ret == 0) {
-               ret = ctdb_persistent_store(state);
-               tdb_chainunlock(state->tdb, state->key);
-               talloc_free(state);
-               return ret;
-       }
-#endif
 
-       /* wait until we have a lock on this record */
-       handle = ctdb_lockwait(ctdb_db, state->key, ctdb_persistent_lock_callback, state);
+       /* create a child process to take out a transaction and 
+          write the data.
+       */
+       handle = ctdb_childwrite(ctdb_db, state->key, state->data, ctdb_persistent_write_callback, state);
        if (handle == NULL) {
-               DEBUG(DEBUG_ERR,("Failed to setup lockwait handler in ctdb_control_update_record\n"));
+               DEBUG(DEBUG_ERR,("Failed to setup childwrite handler in ctdb_control_update_record\n"));
                talloc_free(state);
                return -1;
        }
index a9839d984571978a214a0c9b524bf4f331ebdc1c..151179a8e2122ed97c2692e028a4125ae4d097ae 100644 (file)
@@ -117,6 +117,8 @@ static void show_statistics(struct ctdb_statistics *s)
                STATISTICS_FIELD(pending_calls),
                STATISTICS_FIELD(lockwait_calls),
                STATISTICS_FIELD(pending_lockwait_calls),
+               STATISTICS_FIELD(childwrite_calls),
+               STATISTICS_FIELD(pending_childwrite_calls),
                STATISTICS_FIELD(memory_used),
                STATISTICS_FIELD(max_hop_count),
        };
@@ -139,6 +141,7 @@ static void show_statistics(struct ctdb_statistics *s)
        }
        printf(" %-30s     %.6f sec\n", "max_call_latency", s->max_call_latency);
        printf(" %-30s     %.6f sec\n", "max_lockwait_latency", s->max_lockwait_latency);
+       printf(" %-30s     %.6f sec\n", "max_childwrite_latency", s->max_childwrite_latency);
        talloc_free(tmp_ctx);
 }