freeze: abort vacuuming when we're going to freeze.
author Rusty Russell <rusty@rustcorp.com.au>
Wed, 21 Jul 2010 02:59:55 +0000 (12:29 +0930)
committer Rusty Russell <rusty@rustcorp.com.au>
Wed, 18 Aug 2010 01:24:28 +0000 (10:54 +0930)
There are some reports of freeze timeouts, and it looks like vacuuming might
be the culprit.  So we add code to tell them to abort when a freeze is
going on.

(This is based on the 1.0.112 branch version 517f05e42f, but far
 simpler since tdb is now robust against processes being killed during
 transaction commit)

CQ:S1018154 & S1018349
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
include/ctdb_private.h
server/ctdb_freeze.c
server/ctdb_vacuum.c

index ca3d613f5bc65533d4ee98ea052551c79847febb..cd387c5393030a98e0c0ce624374c506cddc7dec 100644 (file)
@@ -456,6 +456,8 @@ struct ctdb_context {
 
        TALLOC_CTX *banning_ctx;
 
+       struct ctdb_vacuum_child_context *vacuumers;
+
        /* mapping from pid to ctdb_client * */
        struct ctdb_client_pid_list *client_pids;
 
@@ -1312,6 +1314,7 @@ int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval
 int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply);
 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb);
 
+void ctdb_stop_vacuuming(struct ctdb_context *ctdb);
 int ctdb_vacuum_init(struct ctdb_db_context *ctdb_db);
 
 int32_t ctdb_control_enable_script(struct ctdb_context *ctdb, TDB_DATA indata);
index e641ef3ae68c5e756250416441630e8b153b1b6d..4e977589e1efdbad35cd5cd3f87b5d40cca90353 100644 (file)
@@ -272,6 +272,9 @@ int ctdb_start_freeze(struct ctdb_context *ctdb, uint32_t priority)
                return 0;
        }
 
+       /* Stop any vacuuming going on: we don't want to wait. */
+       ctdb_stop_vacuuming(ctdb);
+
        /* if there isn't a freeze lock child then create one */
        if (ctdb->freeze_handles[priority] == NULL) {
                ctdb->freeze_handles[priority] = ctdb_freeze_lock(ctdb, priority);
index f1e61dbf115483f1540a23db57578514d5b7d4a0..17afd79afde343ad81ef97a3a9b62713bc3fa074 100644 (file)
@@ -36,7 +36,9 @@
 enum vacuum_child_status { VACUUM_RUNNING, VACUUM_OK, VACUUM_ERROR, VACUUM_TIMEOUT};
 
 struct ctdb_vacuum_child_context {
+       struct ctdb_vacuum_child_context *next, *prev;
        struct ctdb_vacuum_handle *vacuum_handle;
+       /* fd child writes status to */
        int fd[2];
        pid_t child_pid;
        enum vacuum_child_status status;
@@ -743,6 +745,8 @@ static int vacuum_child_destructor(struct ctdb_vacuum_child_context *child_ctx)
                kill(child_ctx->child_pid, SIGKILL);
        }
 
+       DLIST_REMOVE(ctdb->vacuumers, child_ctx);
+
        event_add_timed(ctdb->ev, child_ctx->vacuum_handle,
                        timeval_current_ofs(get_vacuum_interval(ctdb_db), 0), 
                        ctdb_vacuum_event, child_ctx->vacuum_handle);
@@ -861,6 +865,7 @@ ctdb_vacuum_event(struct event_context *ev, struct timed_event *te,
        child_ctx->status = VACUUM_RUNNING;
        child_ctx->start_time = timeval_current();
 
+       DLIST_ADD(ctdb->vacuumers, child_ctx);
        talloc_set_destructor(child_ctx, vacuum_child_destructor);
 
        event_add_timed(ctdb->ev, child_ctx,
@@ -878,6 +883,17 @@ ctdb_vacuum_event(struct event_context *ev, struct timed_event *te,
        child_ctx->vacuum_handle = vacuum_handle;
 }
 
+/* Abort every vacuum child on ctdb->vacuumers so a freeze need not wait. */
+void ctdb_stop_vacuuming(struct ctdb_context *ctdb)
+{
+       while (ctdb->vacuumers) {
+               DEBUG(DEBUG_INFO, ("Aborting vacuuming for %s (%d)\n",
+                          ctdb->vacuumers->vacuum_handle->ctdb_db->db_name,
+                          (int)ctdb->vacuumers->child_pid));
+               /* vacuum_child_destructor kills it, removes from list */
+               talloc_free(ctdb->vacuumers);
+       }
+}
 
 /* this function initializes the vacuuming context for a database
  * starts the vacuuming events