example libctdb.a and test program libctdb/tst.c
[sahlberg/ctdb.git] / client / ctdb_client.c
index 2c86b3e8441a2a6be49373a2da07a33691fe231b..665f61d2c42dfa728dba2402137fcb2cd3984341 100644 (file)
 #include "../include/ctdb_private.h"
 #include "lib/util/dlinklist.h"
 
-/*
-  allocate a packet for use in client<->daemon communication
- */
-struct ctdb_req_header *_ctdbd_allocate_pkt(struct ctdb_context *ctdb,
-                                           TALLOC_CTX *mem_ctx, 
-                                           enum ctdb_operation operation, 
-                                           size_t length, size_t slength,
-                                           const char *type)
-{
-       int size;
-       struct ctdb_req_header *hdr;
-
-       length = MAX(length, slength);
-       size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
-
-       hdr = (struct ctdb_req_header *)talloc_size(mem_ctx, size);
-       if (hdr == NULL) {
-               DEBUG(DEBUG_ERR,("Unable to allocate packet for operation %u of length %u\n",
-                        operation, (unsigned)length));
-               return NULL;
-       }
-       talloc_set_name_const(hdr, type);
-       memset(hdr, 0, slength);
-       hdr->length       = length;
-       hdr->operation    = operation;
-       hdr->ctdb_magic   = CTDB_MAGIC;
-       hdr->ctdb_version = CTDB_VERSION;
-       hdr->srcnode      = ctdb->pnn;
-       if (ctdb->vnn_map) {
-               hdr->generation = ctdb->vnn_map->generation;
-       }
-
-       return hdr;
-}
-
-/*
-  local version of ctdb_call
-*/
-int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
-                   struct ctdb_ltdb_header *header, TALLOC_CTX *mem_ctx,
-                   TDB_DATA *data, uint32_t caller)
-{
-       struct ctdb_call_info *c;
-       struct ctdb_registered_call *fn;
-       struct ctdb_context *ctdb = ctdb_db->ctdb;
-       
-       c = talloc(ctdb, struct ctdb_call_info);
-       CTDB_NO_MEMORY(ctdb, c);
-
-       c->key = call->key;
-       c->call_data = &call->call_data;
-       c->record_data.dptr = talloc_memdup(c, data->dptr, data->dsize);
-       c->record_data.dsize = data->dsize;
-       CTDB_NO_MEMORY(ctdb, c->record_data.dptr);
-       c->new_data = NULL;
-       c->reply_data = NULL;
-       c->status = 0;
-
-       for (fn=ctdb_db->calls;fn;fn=fn->next) {
-               if (fn->id == call->call_id) break;
-       }
-       if (fn == NULL) {
-               ctdb_set_error(ctdb, "Unknown call id %u\n", call->call_id);
-               talloc_free(c);
-               return -1;
-       }
-
-       if (fn->fn(c) != 0) {
-               ctdb_set_error(ctdb, "ctdb_call %u failed\n", call->call_id);
-               talloc_free(c);
-               return -1;
-       }
-
-       if (header->laccessor != caller) {
-               header->lacount = 0;
-       }
-       header->laccessor = caller;
-       header->lacount++;
-
-       /* we need to force the record to be written out if this was a remote access,
-          so that the lacount is updated */
-       if (c->new_data == NULL && header->laccessor != ctdb->pnn) {
-               c->new_data = &c->record_data;
-       }
-
-       if (c->new_data) {
-               /* XXX check that we always have the lock here? */
-               if (ctdb_ltdb_store(ctdb_db, call->key, header, *c->new_data) != 0) {
-                       ctdb_set_error(ctdb, "ctdb_call tdb_store failed\n");
-                       talloc_free(c);
-                       return -1;
-               }
-       }
-
-       if (c->reply_data) {
-               call->reply_data = *c->reply_data;
-
-               talloc_steal(call, call->reply_data.dptr);
-               talloc_set_name_const(call->reply_data.dptr, __location__);
-       } else {
-               call->reply_data.dptr = NULL;
-               call->reply_data.dsize = 0;
-       }
-       call->status = c->status;
-
-       talloc_free(c);
-
-       return 0;
-}
-
-
-/*
-  queue a packet for sending from client to daemon
-*/
-static int ctdb_client_queue_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
-{
-       return ctdb_queue_send(ctdb->daemon.queue, (uint8_t *)hdr, hdr->length);
-}
-
-
-/*
-  called when a CTDB_REPLY_CALL packet comes in in the client
-
-  This packet comes in response to a CTDB_REQ_CALL request packet. It
-  contains any reply data from the call
-*/
-static void ctdb_client_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
-{
-       struct ctdb_reply_call *c = (struct ctdb_reply_call *)hdr;
-       struct ctdb_client_call_state *state;
-
-       state = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_client_call_state);
-       if (state == NULL) {
-               DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid));
-               return;
-       }
-
-       if (hdr->reqid != state->reqid) {
-               /* we found a record  but it was the wrong one */
-               DEBUG(DEBUG_ERR, ("Dropped client call reply with reqid:%u\n",hdr->reqid));
-               return;
-       }
-
-       state->call->reply_data.dptr = c->data;
-       state->call->reply_data.dsize = c->datalen;
-       state->call->status = c->status;
-
-       talloc_steal(state, c);
-
-       state->state = CTDB_CALL_DONE;
-
-       if (state->async.fn) {
-               state->async.fn(state);
-       }
-}
-
-static void ctdb_client_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
-
-/*
-  this is called in the client, when data comes in from the daemon
- */
-static void ctdb_client_read_cb(uint8_t *data, size_t cnt, void *args)
-{
-       struct ctdb_context *ctdb = talloc_get_type(args, struct ctdb_context);
-       struct ctdb_req_header *hdr = (struct ctdb_req_header *)data;
-       TALLOC_CTX *tmp_ctx;
-
-       /* place the packet as a child of a tmp_ctx. We then use
-          talloc_free() below to free it. If any of the calls want
-          to keep it, then they will steal it somewhere else, and the
-          talloc_free() will be a no-op */
-       tmp_ctx = talloc_new(ctdb);
-       talloc_steal(tmp_ctx, hdr);
-
-       if (cnt == 0) {
-               DEBUG(DEBUG_INFO,("Daemon has exited - shutting down client\n"));
-               exit(0);
-       }
-
-       if (cnt < sizeof(*hdr)) {
-               DEBUG(DEBUG_CRIT,("Bad packet length %u in client\n", (unsigned)cnt));
-               goto done;
-       }
-       if (cnt != hdr->length) {
-               ctdb_set_error(ctdb, "Bad header length %u expected %u in client\n", 
-                              (unsigned)hdr->length, (unsigned)cnt);
-               goto done;
-       }
-
-       if (hdr->ctdb_magic != CTDB_MAGIC) {
-               ctdb_set_error(ctdb, "Non CTDB packet rejected in client\n");
-               goto done;
-       }
-
-       if (hdr->ctdb_version != CTDB_VERSION) {
-               ctdb_set_error(ctdb, "Bad CTDB version 0x%x rejected in client\n", hdr->ctdb_version);
-               goto done;
-       }
-
-       switch (hdr->operation) {
-       case CTDB_REPLY_CALL:
-               ctdb_client_reply_call(ctdb, hdr);
-               break;
-
-       case CTDB_REQ_MESSAGE:
-               ctdb_request_message(ctdb, hdr);
-               break;
-
-       case CTDB_REPLY_CONTROL:
-               ctdb_client_reply_control(ctdb, hdr);
-               break;
-
-       default:
-               DEBUG(DEBUG_CRIT,("bogus operation code:%u\n",hdr->operation));
-       }
-
-done:
-       talloc_free(tmp_ctx);
-}
-
-/*
-  connect to a unix domain socket
-*/
-int ctdb_socket_connect(struct ctdb_context *ctdb)
-{
-       struct sockaddr_un addr;
-
-       memset(&addr, 0, sizeof(addr));
-       addr.sun_family = AF_UNIX;
-       strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path));
-
-       ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
-       if (ctdb->daemon.sd == -1) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. Errno:%s(%d)\n", strerror(errno), errno));
-               return -1;
-       }
-
-       set_nonblocking(ctdb->daemon.sd);
-       set_close_on_exec(ctdb->daemon.sd);
-       
-       if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
-               close(ctdb->daemon.sd);
-               ctdb->daemon.sd = -1;
-               DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon. Errno:%s(%d)\n", strerror(errno), errno));
-               return -1;
-       }
-
-       ctdb->daemon.queue = ctdb_queue_setup(ctdb, ctdb, ctdb->daemon.sd, 
-                                             CTDB_DS_ALIGNMENT, 
-                                             ctdb_client_read_cb, ctdb);
-       return 0;
-}
 
 
 struct ctdb_record_handle {
@@ -331,129 +79,7 @@ int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call)
 
 
 
-/*
-  destroy a ctdb_call in client
-*/
-static int ctdb_client_call_destructor(struct ctdb_client_call_state *state)   
-{
-       ctdb_reqid_remove(state->ctdb_db->ctdb, state->reqid);
-       return 0;
-}
-
-/*
-  construct an event driven local ctdb_call
 
-  this is used so that locally processed ctdb_call requests are processed
-  in an event driven manner
-*/
-static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db_context *ctdb_db, 
-                                                                 struct ctdb_call *call,
-                                                                 struct ctdb_ltdb_header *header,
-                                                                 TDB_DATA *data)
-{
-       struct ctdb_client_call_state *state;
-       struct ctdb_context *ctdb = ctdb_db->ctdb;
-       int ret;
-
-       state = talloc_zero(ctdb_db, struct ctdb_client_call_state);
-       CTDB_NO_MEMORY_NULL(ctdb, state);
-       state->call = talloc_zero(state, struct ctdb_call);
-       CTDB_NO_MEMORY_NULL(ctdb, state->call);
-
-       talloc_steal(state, data->dptr);
-
-       state->state   = CTDB_CALL_DONE;
-       *(state->call) = *call;
-       state->ctdb_db = ctdb_db;
-
-       ret = ctdb_call_local(ctdb_db, state->call, header, state, data, ctdb->pnn);
-
-       return state;
-}
-
-/*
-  make a ctdb call to the local daemon - async send. Called from client context.
-
-  This constructs a ctdb_call request and queues it for processing. 
-  This call never blocks.
-*/
-struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, 
-                                             struct ctdb_call *call)
-{
-       struct ctdb_client_call_state *state;
-       struct ctdb_context *ctdb = ctdb_db->ctdb;
-       struct ctdb_ltdb_header header;
-       TDB_DATA data;
-       int ret;
-       size_t len;
-       struct ctdb_req_call *c;
-
-       /* if the domain socket is not yet open, open it */
-       if (ctdb->daemon.sd==-1) {
-               ctdb_socket_connect(ctdb);
-       }
-
-       ret = ctdb_ltdb_lock(ctdb_db, call->key);
-       if (ret != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed to get chainlock\n"));
-               return NULL;
-       }
-
-       ret = ctdb_ltdb_fetch(ctdb_db, call->key, &header, ctdb_db, &data);
-
-       if (ret == 0 && header.dmaster == ctdb->pnn) {
-               state = ctdb_client_call_local_send(ctdb_db, call, &header, &data);
-               talloc_free(data.dptr);
-               ctdb_ltdb_unlock(ctdb_db, call->key);
-               return state;
-       }
-
-       ctdb_ltdb_unlock(ctdb_db, call->key);
-       talloc_free(data.dptr);
-
-       state = talloc_zero(ctdb_db, struct ctdb_client_call_state);
-       if (state == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " failed to allocate state\n"));
-               return NULL;
-       }
-       state->call = talloc_zero(state, struct ctdb_call);
-       if (state->call == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " failed to allocate state->call\n"));
-               return NULL;
-       }
-
-       len = offsetof(struct ctdb_req_call, data) + call->key.dsize + call->call_data.dsize;
-       c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CALL, len, struct ctdb_req_call);
-       if (c == NULL) {
-               DEBUG(DEBUG_ERR, (__location__ " failed to allocate packet\n"));
-               return NULL;
-       }
-
-       state->reqid     = ctdb_reqid_new(ctdb, state);
-       state->ctdb_db = ctdb_db;
-       talloc_set_destructor(state, ctdb_client_call_destructor);
-
-       c->hdr.reqid     = state->reqid;
-       c->flags         = call->flags;
-       c->db_id         = ctdb_db->db_id;
-       c->callid        = call->call_id;
-       c->hopcount      = 0;
-       c->keylen        = call->key.dsize;
-       c->calldatalen   = call->call_data.dsize;
-       memcpy(&c->data[0], call->key.dptr, call->key.dsize);
-       memcpy(&c->data[call->key.dsize], 
-              call->call_data.dptr, call->call_data.dsize);
-       *(state->call)              = *call;
-       state->call->call_data.dptr = &c->data[call->key.dsize];
-       state->call->key.dptr       = &c->data[0];
-
-       state->state  = CTDB_CALL_WAIT;
-
-
-       ctdb_client_queue_pkt(ctdb, &c->hdr);
-
-       return state;
-}
 
 
 /*
@@ -512,33 +138,6 @@ int ctdb_remove_message_handler(struct ctdb_context *ctdb, uint64_t srvid, void
 }
 
 
-/*
-  send a message - from client context
- */
-int ctdb_send_message(struct ctdb_context *ctdb, uint32_t pnn,
-                     uint64_t srvid, TDB_DATA data)
-{
-       struct ctdb_req_message *r;
-       int len, res;
-
-       len = offsetof(struct ctdb_req_message, data) + data.dsize;
-       r = ctdbd_allocate_pkt(ctdb, ctdb, CTDB_REQ_MESSAGE, 
-                              len, struct ctdb_req_message);
-       CTDB_NO_MEMORY(ctdb, r);
-
-       r->hdr.destnode  = pnn;
-       r->srvid         = srvid;
-       r->datalen       = data.dsize;
-       memcpy(&r->data[0], data.dptr, data.dsize);
-       
-       res = ctdb_client_queue_pkt(ctdb, &r->hdr);
-       if (res != 0) {
-               return res;
-       }
-
-       talloc_free(r);
-       return 0;
-}
 
 
 /*
@@ -644,46 +243,12 @@ again:
 */
 int ctdb_record_store(struct ctdb_record_handle *h, TDB_DATA data)
 {
-       int ret;
-       int32_t status;
-       struct ctdb_rec_data *rec;
-       TDB_DATA recdata;
-
        if (h->ctdb_db->persistent) {
-               h->header.rsn++;
-       }
-
-       ret = ctdb_ltdb_store(h->ctdb_db, h->key, &h->header, data);
-       if (ret != 0) {
-               return ret;
-       }
-
-       /* don't need the persistent_store control for non-persistent databases */
-       if (!h->ctdb_db->persistent) {
-               return 0;
-       }
-
-       rec = ctdb_marshall_record(h, h->ctdb_db->db_id, h->key, &h->header, data);
-       if (rec == NULL) {
-               DEBUG(DEBUG_ERR,("Unable to marshall record in ctdb_record_store\n"));
+               DEBUG(DEBUG_ERR, (__location__ " ctdb_record_store prohibited for persistent dbs\n"));
                return -1;
        }
 
-       recdata.dptr = (uint8_t *)rec;
-       recdata.dsize = rec->length;
-
-       ret = ctdb_control(h->ctdb_db->ctdb, CTDB_CURRENT_NODE, 0, 
-                          CTDB_CONTROL_PERSISTENT_STORE, 0,
-                          recdata, NULL, NULL, &status, NULL, NULL);
-
-       talloc_free(rec);
-
-       if (ret != 0 || status != 0) {
-               DEBUG(DEBUG_ERR,("Failed persistent store in ctdb_record_store\n"));
-               return -1;
-       }
-
-       return 0;
+       return ctdb_ltdb_store(h->ctdb_db, h->key, &h->header, data);
 }
 
 /*
@@ -711,241 +276,8 @@ int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
 
 
 
-/*
-   called when a control completes or timesout to invoke the callback
-   function the user provided
-*/
-static void invoke_control_callback(struct event_context *ev, struct timed_event *te, 
-       struct timeval t, void *private_data)
-{
-       struct ctdb_client_control_state *state;
-       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
-       int ret;
-
-       state = talloc_get_type(private_data, struct ctdb_client_control_state);
-       talloc_steal(tmp_ctx, state);
-
-       ret = ctdb_control_recv(state->ctdb, state, state,
-                       NULL, 
-                       NULL, 
-                       NULL);
-
-       talloc_free(tmp_ctx);
-}
-
-/*
-  called when a CTDB_REPLY_CONTROL packet comes in in the client
-
-  This packet comes in response to a CTDB_REQ_CONTROL request packet. It
-  contains any reply data from the control
-*/
-static void ctdb_client_reply_control(struct ctdb_context *ctdb, 
-                                     struct ctdb_req_header *hdr)
-{
-       struct ctdb_reply_control *c = (struct ctdb_reply_control *)hdr;
-       struct ctdb_client_control_state *state;
-
-       state = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_client_control_state);
-       if (state == NULL) {
-               DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid));
-               return;
-       }
-
-       if (hdr->reqid != state->reqid) {
-               /* we found a record  but it was the wrong one */
-               DEBUG(DEBUG_ERR, ("Dropped orphaned reply control with reqid:%u\n",hdr->reqid));
-               return;
-       }
-
-       state->outdata.dptr = c->data;
-       state->outdata.dsize = c->datalen;
-       state->status = c->status;
-       if (c->errorlen) {
-               state->errormsg = talloc_strndup(state, 
-                                                (char *)&c->data[c->datalen], 
-                                                c->errorlen);
-       }
-
-       /* state->outdata now uses resources from c so we dont want c
-          to just dissappear from under us while state is still alive
-       */
-       talloc_steal(state, c);
-
-       state->state = CTDB_CONTROL_DONE;
-
-       /* if we had a callback registered for this control, pull the response
-          and call the callback.
-       */
-       if (state->async.fn) {
-               event_add_timed(ctdb->ev, state, timeval_zero(), invoke_control_callback, state);
-       }
-}
-
-
-/*
-  destroy a ctdb_control in client
-*/
-static int ctdb_control_destructor(struct ctdb_client_control_state *state)    
-{
-       ctdb_reqid_remove(state->ctdb, state->reqid);
-       return 0;
-}
-
-
-/* time out handler for ctdb_control */
-static void control_timeout_func(struct event_context *ev, struct timed_event *te, 
-       struct timeval t, void *private_data)
-{
-       struct ctdb_client_control_state *state = talloc_get_type(private_data, struct ctdb_client_control_state);
-
-       DEBUG(DEBUG_ERR,("control timed out. reqid:%d opcode:%d dstnode:%d\n", state->reqid, state->c->opcode, state->c->hdr.destnode));
-
-       state->state = CTDB_CONTROL_TIMEOUT;
-
-       /* if we had a callback registered for this control, pull the response
-          and call the callback.
-       */
-       if (state->async.fn) {
-               event_add_timed(state->ctdb->ev, state, timeval_zero(), invoke_control_callback, state);
-       }
-}
-
-/* async version of send control request */
-struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb, 
-               uint32_t destnode, uint64_t srvid, 
-               uint32_t opcode, uint32_t flags, TDB_DATA data, 
-               TALLOC_CTX *mem_ctx,
-               struct timeval *timeout,
-               char **errormsg)
-{
-       struct ctdb_client_control_state *state;
-       size_t len;
-       struct ctdb_req_control *c;
-       int ret;
-
-       if (errormsg) {
-               *errormsg = NULL;
-       }
-
-       /* if the domain socket is not yet open, open it */
-       if (ctdb->daemon.sd==-1) {
-               ctdb_socket_connect(ctdb);
-       }
-
-       state = talloc_zero(mem_ctx, struct ctdb_client_control_state);
-       CTDB_NO_MEMORY_NULL(ctdb, state);
-
-       state->ctdb       = ctdb;
-       state->reqid      = ctdb_reqid_new(ctdb, state);
-       state->state      = CTDB_CONTROL_WAIT;
-       state->errormsg   = NULL;
-
-       talloc_set_destructor(state, ctdb_control_destructor);
-
-       len = offsetof(struct ctdb_req_control, data) + data.dsize;
-       c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CONTROL, 
-                              len, struct ctdb_req_control);
-       state->c            = c;        
-       CTDB_NO_MEMORY_NULL(ctdb, c);
-       c->hdr.reqid        = state->reqid;
-       c->hdr.destnode     = destnode;
-       c->hdr.reqid        = state->reqid;
-       c->opcode           = opcode;
-       c->client_id        = 0;
-       c->flags            = flags;
-       c->srvid            = srvid;
-       c->datalen          = data.dsize;
-       if (data.dsize) {
-               memcpy(&c->data[0], data.dptr, data.dsize);
-       }
-
-       /* timeout */
-       if (timeout && !timeval_is_zero(timeout)) {
-               event_add_timed(ctdb->ev, state, *timeout, control_timeout_func, state);
-       }
-
-       ret = ctdb_client_queue_pkt(ctdb, &(c->hdr));
-       if (ret != 0) {
-               talloc_free(state);
-               return NULL;
-       }
-
-       if (flags & CTDB_CTRL_FLAG_NOREPLY) {
-               talloc_free(state);
-               return NULL;
-       }
-
-       return state;
-}
-
-
-/* async version of receive control reply */
-int ctdb_control_recv(struct ctdb_context *ctdb, 
-               struct ctdb_client_control_state *state, 
-               TALLOC_CTX *mem_ctx,
-               TDB_DATA *outdata, int32_t *status, char **errormsg)
-{
-       TALLOC_CTX *tmp_ctx;
-
-       if (status != NULL) {
-               *status = -1;
-       }
-       if (errormsg != NULL) {
-               *errormsg = NULL;
-       }
-
-       if (state == NULL) {
-               return -1;
-       }
-
-       /* prevent double free of state */
-       tmp_ctx = talloc_new(ctdb);
-       talloc_steal(tmp_ctx, state);
 
-       /* loop one event at a time until we either timeout or the control
-          completes.
-       */
-       while (state->state == CTDB_CONTROL_WAIT) {
-               event_loop_once(ctdb->ev);
-       }
 
-       if (state->state != CTDB_CONTROL_DONE) {
-               DEBUG(DEBUG_ERR,(__location__ " ctdb_control_recv failed\n"));
-               if (state->async.fn) {
-                       state->async.fn(state);
-               }
-               talloc_free(tmp_ctx);
-               return -1;
-       }
-
-       if (state->errormsg) {
-               DEBUG(DEBUG_ERR,("ctdb_control error: '%s'\n", state->errormsg));
-               if (errormsg) {
-                       (*errormsg) = talloc_move(mem_ctx, &state->errormsg);
-               }
-               if (state->async.fn) {
-                       state->async.fn(state);
-               }
-               talloc_free(tmp_ctx);
-               return -1;
-       }
-
-       if (outdata) {
-               *outdata = state->outdata;
-               outdata->dptr = talloc_memdup(mem_ctx, outdata->dptr, outdata->dsize);
-       }
-
-       if (status) {
-               *status = state->status;
-       }
-
-       if (state->async.fn) {
-               state->async.fn(state);
-       }
-
-       talloc_free(tmp_ctx);
-       return 0;
-}
 
 
 
@@ -1528,6 +860,44 @@ int ctdb_ctrl_getdbname(struct ctdb_context *ctdb, struct timeval timeout, uint3
        return 0;
 }
 
+/*
+  get the health status of a db
+
+  Sends CTDB_CONTROL_DB_GET_HEALTH to destnode with dbid as the request
+  payload and waits for the reply.
+
+  Returns 0 on success. In that case *reason is NULL when the reply
+  carried no data, otherwise it is a NUL-terminated copy of the reply
+  data allocated on mem_ctx (owned by the caller).
+  NOTE(review): presumably an empty reply means "healthy" - confirm
+  against the daemon side of this control.
+
+  Returns -1 when the control fails, when it reports a non-zero status,
+  or when the copy of the reason cannot be allocated. *reason is not
+  written when the control itself fails.
+ */
+int ctdb_ctrl_getdbhealth(struct ctdb_context *ctdb,
+                         struct timeval timeout,
+                         uint32_t destnode,
+                         uint32_t dbid, TALLOC_CTX *mem_ctx,
+                         const char **reason)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA indata;
+       TDB_DATA outdata = tdb_null;
+
+       /* the request payload is just the raw database id */
+       indata.dptr = (uint8_t *)&dbid;
+       indata.dsize = sizeof(dbid);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_DB_GET_HEALTH, 0, indata,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               return -1;
+       }
+
+       if (outdata.dsize == 0) {
+               (*reason) = NULL;
+               return 0;
+       }
+
+       (*reason) = talloc_strndup(mem_ctx, (const char *)outdata.dptr, outdata.dsize);
+
+       /* the reply buffer is no longer needed whether or not the copy
+          succeeded; releasing it before the check keeps it from staying
+          attached to mem_ctx on the failure path */
+       talloc_free(outdata.dptr);
+
+       if ((*reason) == NULL) {
+               return -1;
+       }
+
+       return 0;
+}
+
 /*
   create a database
  */
@@ -1714,9 +1084,10 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name,
        }
 
        tdb_flags = persistent?TDB_DEFAULT:TDB_NOSYNC;
-       if (!ctdb->do_setsched) {
+       if (ctdb->valgrinding) {
                tdb_flags |= TDB_NOMMAP;
        }
+       tdb_flags |= TDB_DISALLOW_NESTING;
 
        ctdb_db->ltdb = tdb_wrap_open(ctdb, ctdb_db->db_path, 0, tdb_flags, O_RDWR, 0);
        if (ctdb_db->ltdb == NULL) {
@@ -1879,15 +1250,12 @@ int ctdb_traverse(struct ctdb_db_context *ctdb_db, ctdb_traverse_func fn, void *
 /*
   called on each key during a catdb
  */
-static int dumpdb_fn(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, void *p)
+int ctdb_dumpdb_record(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, void *p)
 {
        int i;
        FILE *f = (FILE *)p;
        struct ctdb_ltdb_header *h = (struct ctdb_ltdb_header *)data.dptr;
 
-       fprintf(f, "dmaster: %u\n", h->dmaster);
-       fprintf(f, "rsn: %llu\n", (unsigned long long)h->rsn);
-
        fprintf(f, "key(%u) = \"", (unsigned)key.dsize);
        for (i=0;i<key.dsize;i++) {
                if (ISASCII(key.dptr[i])) {
@@ -1898,7 +1266,10 @@ static int dumpdb_fn(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, voi
        }
        fprintf(f, "\"\n");
 
-       fprintf(f, "data(%u) = \"", (unsigned)data.dsize);
+       fprintf(f, "dmaster: %u\n", h->dmaster);
+       fprintf(f, "rsn: %llu\n", (unsigned long long)h->rsn);
+
+       fprintf(f, "data(%u) = \"", (unsigned)(data.dsize - sizeof(*h)));
        for (i=sizeof(*h);i<data.dsize;i++) {
                if (ISASCII(data.dptr[i])) {
                        fprintf(f, "%c", data.dptr[i]);
@@ -1908,6 +1279,8 @@ static int dumpdb_fn(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, voi
        }
        fprintf(f, "\"\n");
 
+       fprintf(f, "\n");
+
        return 0;
 }
 
@@ -1916,7 +1289,7 @@ static int dumpdb_fn(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, voi
  */
 int ctdb_dump_db(struct ctdb_db_context *ctdb_db, FILE *f)
 {
-       return ctdb_traverse(ctdb_db, dumpdb_fn, f);
+       return ctdb_traverse(ctdb_db, ctdb_dumpdb_record, f);
 }
 
 /*
@@ -1945,9 +1318,9 @@ int ctdb_ctrl_getpid(struct ctdb_context *ctdb, struct timeval timeout, uint32_t
   async freeze send control
  */
 struct ctdb_client_control_state *
-ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t priority)
 {
-       return ctdb_control_send(ctdb, destnode, 0,
+       return ctdb_control_send(ctdb, destnode, priority,
                           CTDB_CONTROL_FREEZE, 0, tdb_null, 
                           mem_ctx, &timeout, NULL);
 }
@@ -1970,30 +1343,43 @@ int ctdb_ctrl_freeze_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct
 }
 
 /*
-  freeze a node
+  freeze databases of a certain priority
  */
-int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+int ctdb_ctrl_freeze_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t priority)
 {
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        struct ctdb_client_control_state *state;
        int ret;
 
-       state = ctdb_ctrl_freeze_send(ctdb, tmp_ctx, timeout, destnode);
+       state = ctdb_ctrl_freeze_send(ctdb, tmp_ctx, timeout, destnode, priority);
        ret = ctdb_ctrl_freeze_recv(ctdb, tmp_ctx, state);
        talloc_free(tmp_ctx);
 
        return ret;
 }
 
+/*
+  Freeze all databases.
+
+  Walks the priority levels from 1 up to and including
+  NUM_DB_PRIORITIES in ascending order and calls
+  ctdb_ctrl_freeze_priority() for each one. Stops and returns -1 as soon
+  as one of those calls returns non-zero; returns 0 when all of them
+  returned 0.
+ */
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       int prio = 1;
+
+       while (prio <= NUM_DB_PRIORITIES) {
+               int rc = ctdb_ctrl_freeze_priority(ctdb, timeout, destnode, prio);
+
+               if (rc != 0) {
+                       return -1;
+               }
+               prio++;
+       }
+
+       return 0;
+}
+
 /*
-  thaw a node
+  thaw databases of a certain priority
  */
-int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+int ctdb_ctrl_thaw_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t priority)
 {
        int ret;
        int32_t res;
 
-       ret = ctdb_control(ctdb, destnode, 0,
+       ret = ctdb_control(ctdb, destnode, priority,
                           CTDB_CONTROL_THAW, 0, tdb_null, 
                           NULL, NULL, &res, &timeout, NULL);
        if (ret != 0 || res != 0) {
@@ -2004,6 +1390,12 @@ int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t d
        return 0;
 }
 
+/*
+  Thaw all databases.
+
+  Delegates to ctdb_ctrl_thaw_priority() with a priority of 0 and hands
+  its return value back unchanged.
+ */
+int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       const uint32_t all_priorities = 0;
+
+       return ctdb_ctrl_thaw_priority(ctdb, timeout, destnode, all_priorities);
+}
+
 /*
   get pnn of a node, or -1
  */
@@ -2289,16 +1681,18 @@ int ctdb_ctrl_list_tunables(struct ctdb_context *ctdb,
 }
 
 
-int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb, 
-                       struct timeval timeout, uint32_t destnode, 
-                       TALLOC_CTX *mem_ctx, struct ctdb_all_public_ips **ips)
+int ctdb_ctrl_get_public_ips_flags(struct ctdb_context *ctdb,
+                                  struct timeval timeout, uint32_t destnode,
+                                  TALLOC_CTX *mem_ctx,
+                                  uint32_t flags,
+                                  struct ctdb_all_public_ips **ips)
 {
        int ret;
        TDB_DATA outdata;
        int32_t res;
 
        ret = ctdb_control(ctdb, destnode, 0, 
-                          CTDB_CONTROL_GET_PUBLIC_IPS, 0, tdb_null, 
+                          CTDB_CONTROL_GET_PUBLIC_IPS, flags, tdb_null,
                           mem_ctx, &outdata, &res, &timeout, NULL);
        if (ret == 0 && res == -1) {
                DEBUG(DEBUG_ERR,(__location__ " ctdb_control to get public ips failed, falling back to ipv4-only version\n"));
@@ -2315,6 +1709,16 @@ int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb,
        return 0;
 }
 
+/*
+  Convenience wrapper around ctdb_ctrl_get_public_ips_flags() that
+  passes 0 for the flags argument. Every other argument, and the return
+  value, is forwarded unchanged.
+ */
+int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb,
+                            struct timeval timeout, uint32_t destnode,
+                            TALLOC_CTX *mem_ctx,
+                            struct ctdb_all_public_ips **ips)
+{
+       const uint32_t no_flags = 0;
+       int ret;
+
+       ret = ctdb_ctrl_get_public_ips_flags(ctdb, timeout, destnode,
+                                            mem_ctx, no_flags, ips);
+       return ret;
+}
+
 int ctdb_ctrl_get_public_ipsv4(struct ctdb_context *ctdb, 
                        struct timeval timeout, uint32_t destnode, 
                        TALLOC_CTX *mem_ctx, struct ctdb_all_public_ips **ips)
@@ -2348,6 +1752,162 @@ int ctdb_ctrl_get_public_ipsv4(struct ctdb_context *ctdb,
        return 0;
 }
 
+/*
+  fetch the information a node holds about one public address
+
+  addr is sent as the payload of CTDB_CONTROL_GET_PUBLIC_IP_INFO. the reply
+  is validated (header present, room for info->num interface entries), every
+  interface name is forcibly terminated, and a copy allocated on mem_ctx is
+  handed back through _info.
+
+  returns 0 on success, -1 if the control fails, the reply is too short
+  for what it announces, or the copy cannot be allocated
+ */
+int ctdb_ctrl_get_public_ip_info(struct ctdb_context *ctdb,
+                                struct timeval timeout, uint32_t destnode,
+                                TALLOC_CTX *mem_ctx,
+                                const ctdb_sock_addr *addr,
+                                struct ctdb_control_public_ip_info **_info)
+{
+       int ret;
+       TDB_DATA indata;
+       TDB_DATA outdata;
+       int32_t res;
+       struct ctdb_control_public_ip_info *info;
+       size_t len;
+       size_t max_ifaces;
+       uint32_t i;
+
+       indata.dptr = discard_const_p(uint8_t, addr);
+       indata.dsize = sizeof(*addr);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_PUBLIC_IP_INFO, 0, indata,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "failed ret:%d res:%d\n",
+                               ret, res));
+               return -1;
+       }
+
+       /* the fixed-size header must be present before info->num is read */
+       len = offsetof(struct ctdb_control_public_ip_info, ifaces);
+       if (len > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "returned invalid data with size %u < %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)len));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       info = (struct ctdb_control_public_ip_info *)outdata.dptr;
+
+       /*
+        * compare the announced entry count against the room left behind
+        * the header: computing len + num*sizeof() first could wrap around
+        * and let an oversized count slip through the check
+        */
+       max_ifaces = (outdata.dsize - len) / sizeof(struct ctdb_control_iface_info);
+       if (info->num > max_ifaces) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "returned invalid data with size %u: too small for %u ifaces\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)info->num));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       /*
+        * make sure we null terminate the returned strings
+        * NOTE(review): assumes name[] holds more than CTDB_IFACE_SIZE
+        * bytes - confirm against the struct definition
+        */
+       for (i=0; i < info->num; i++) {
+               info->ifaces[i].name[CTDB_IFACE_SIZE] = '\0';
+       }
+
+       /* hand the caller its own copy and drop the reply buffer */
+       *_info = (struct ctdb_control_public_ip_info *)talloc_memdup(mem_ctx,
+                                                               outdata.dptr,
+                                                               outdata.dsize);
+       talloc_free(outdata.dptr);
+       if (*_info == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "talloc_memdup size %u failed\n",
+                               (unsigned int)outdata.dsize));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  fetch the list of interfaces known to a node
+
+  the reply of CTDB_CONTROL_GET_IFACES is validated (header present, room
+  for ifaces->num entries), every interface name is forcibly terminated,
+  and a copy allocated on mem_ctx is handed back through _ifaces.
+
+  returns 0 on success, -1 if the control fails, the reply is too short
+  for what it announces, or the copy cannot be allocated
+ */
+int ctdb_ctrl_get_ifaces(struct ctdb_context *ctdb,
+                        struct timeval timeout, uint32_t destnode,
+                        TALLOC_CTX *mem_ctx,
+                        struct ctdb_control_get_ifaces **_ifaces)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+       struct ctdb_control_get_ifaces *ifaces;
+       size_t len;
+       size_t max_ifaces;
+       uint32_t i;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_IFACES, 0, tdb_null,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "failed ret:%d res:%d\n",
+                               ret, res));
+               return -1;
+       }
+
+       /* the fixed-size header must be present before ifaces->num is read */
+       len = offsetof(struct ctdb_control_get_ifaces, ifaces);
+       if (len > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "returned invalid data with size %u < %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)len));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       ifaces = (struct ctdb_control_get_ifaces *)outdata.dptr;
+
+       /*
+        * compare the announced entry count against the room left behind
+        * the header: computing len + num*sizeof() first could wrap around
+        * and let an oversized count slip through the check
+        */
+       max_ifaces = (outdata.dsize - len) / sizeof(struct ctdb_control_iface_info);
+       if (ifaces->num > max_ifaces) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "returned invalid data with size %u: too small for %u ifaces\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)ifaces->num));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       /*
+        * make sure we null terminate the returned strings
+        * NOTE(review): assumes name[] holds more than CTDB_IFACE_SIZE
+        * bytes - confirm against the struct definition
+        */
+       for (i=0; i < ifaces->num; i++) {
+               ifaces->ifaces[i].name[CTDB_IFACE_SIZE] = '\0';
+       }
+
+       /* hand the caller its own copy and drop the reply buffer */
+       *_ifaces = (struct ctdb_control_get_ifaces *)talloc_memdup(mem_ctx,
+                                                                 outdata.dptr,
+                                                                 outdata.dsize);
+       talloc_free(outdata.dptr);
+       if (*_ifaces == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "talloc_memdup size %u failed\n",
+                               (unsigned int)outdata.dsize));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  send an interface info record to destnode with the
+  CTDB_CONTROL_SET_IFACE_LINK_STATE control
+
+  returns 0 when the control succeeds with status 0, -1 otherwise
+ */
+int ctdb_ctrl_set_iface_link(struct ctdb_context *ctdb,
+                            struct timeval timeout, uint32_t destnode,
+                            TALLOC_CTX *mem_ctx,
+                            const struct ctdb_control_iface_info *info)
+{
+       TDB_DATA payload;
+       int32_t status;
+       int rc;
+
+       /* the caller's record is sent as-is; it is only read from */
+       payload.dsize = sizeof(*info);
+       payload.dptr = discard_const_p(uint8_t, info);
+
+       rc = ctdb_control(ctdb, destnode, 0,
+                         CTDB_CONTROL_SET_IFACE_LINK_STATE, 0, payload,
+                         mem_ctx, NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set iface link "
+                       "failed ret:%d res:%d\n",
+                       rc, status));
+       return -1;
+}
+
 /*
   set/clear the permanent disabled bit on a remote node
  */
@@ -2398,11 +1958,11 @@ int ctdb_ctrl_modflags(struct ctdb_context *ctdb, struct timeval timeout, uint32
        nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
 
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
-                                       nodes,
+                                       nodes, 0,
                                        timeout, false, data,
                                        NULL, NULL,
                                        NULL) != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " ctdb_control to disable node failed\n"));
+               DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 
                talloc_free(tmp_ctx);
                return -1;
@@ -2740,16 +2300,6 @@ void ctdb_set_flags(struct ctdb_context *ctdb, unsigned flags)
        ctdb->flags |= flags;
 }
 
-/*
-  setup the local socket name
-*/
-int ctdb_set_socketname(struct ctdb_context *ctdb, const char *socketname)
-{
-       ctdb->daemon.name = talloc_strdup(ctdb, socketname);
-       CTDB_NO_MEMORY(ctdb, ctdb->daemon.name);
-
-       return 0;
-}
 
 /*
   return the pnn of this node
@@ -2836,7 +2386,7 @@ static void async_callback(struct ctdb_client_control_state *state)
        */
        if (state->state != CTDB_CONTROL_DONE) {
                if ( !data->dont_log_errors) {
-                       DEBUG(DEBUG_ERR,("Async operation failed with state %d\n opcode:%u", state->state, data->opcode));
+                       DEBUG(DEBUG_ERR,("Async operation failed with state %d, opcode:%u\n", state->state, data->opcode));
                }
                data->fail_count++;
                if (data->fail_callback) {
@@ -2903,6 +2453,7 @@ int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *
 int ctdb_client_async_control(struct ctdb_context *ctdb,
                                enum ctdb_controls opcode,
                                uint32_t *nodes,
+                               uint64_t srvid,
                                struct timeval timeout,
                                bool dont_log_errors,
                                TDB_DATA data,
@@ -2928,7 +2479,7 @@ int ctdb_client_async_control(struct ctdb_context *ctdb,
        for (j=0; j<num_nodes; j++) {
                uint32_t pnn = nodes[j];
 
-               state = ctdb_control_send(ctdb, pnn, 0, opcode, 
+               state = ctdb_control_send(ctdb, pnn, srvid, opcode, 
                                          0, data, async_data, &timeout, NULL);
                if (state == NULL) {
                        DEBUG(DEBUG_ERR,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
@@ -3010,6 +2561,40 @@ uint32_t *list_of_active_nodes(struct ctdb_context *ctdb,
        return nodes;
 }
 
+/*
+  build an array with the pnn of every node in node_map that does not have
+  NODE_FLAGS_INACTIVE set and whose pnn differs from the given one.
+  the array is allocated on mem_ctx.
+ */
+uint32_t *list_of_active_nodes_except_pnn(struct ctdb_context *ctdb,
+                               struct ctdb_node_map *node_map,
+                               TALLOC_CTX *mem_ctx,
+                               uint32_t pnn)
+{
+       int idx;
+       int wanted = 0;
+       int filled = 0;
+       uint32_t *nodes;
+
+       /* first pass: count the matching nodes so the array can be sized */
+       for (idx = 0; idx < node_map->num; idx++) {
+               if (!(node_map->nodes[idx].flags & NODE_FLAGS_INACTIVE) &&
+                   node_map->nodes[idx].pnn != pnn) {
+                       wanted++;
+               }
+       }
+
+       nodes = talloc_array(mem_ctx, uint32_t, wanted);
+       CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+       /* second pass: same filter, this time recording the pnn values */
+       for (idx = 0; idx < node_map->num; idx++) {
+               if (!(node_map->nodes[idx].flags & NODE_FLAGS_INACTIVE) &&
+                   node_map->nodes[idx].pnn != pnn) {
+                       nodes[filled] = node_map->nodes[idx].pnn;
+                       filled++;
+               }
+       }
+
+       return nodes;
+}
+
 uint32_t *list_of_connected_nodes(struct ctdb_context *ctdb,
                                struct ctdb_node_map *node_map,
                                TALLOC_CTX *mem_ctx,
@@ -3105,27 +2690,57 @@ int ctdb_ctrl_getcapabilities_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ct
                *capabilities = *((uint32_t *)outdata.dptr);
        }
 
-       return 0;
+       return 0;
+}
+
+int ctdb_ctrl_getcapabilities(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *capabilities)
+{
+       /* scratch context handed to both the send and the recv half */
+       TALLOC_CTX *scratch = talloc_new(NULL);
+       struct ctdb_client_control_state *pending;
+       int rc;
+
+       pending = ctdb_ctrl_getcapabilities_send(ctdb, scratch, timeout, destnode);
+       rc = ctdb_ctrl_getcapabilities_recv(ctdb, scratch, pending, capabilities);
+
+       /* the scratch context is released whatever the outcome */
+       talloc_free(scratch);
+       return rc;
+}
+
+/**
+ * ask destnode whether a transaction is active on the database db_id
+ *
+ * returns the status reported by the CTDB_CONTROL_TRANS2_ACTIVE control,
+ * or -1 if the control itself could not be carried out
+ */
+int32_t ctdb_ctrl_transaction_active(struct ctdb_context *ctdb,
+                                    uint32_t destnode,
+                                    uint32_t db_id)
+{
+       TDB_DATA request;
+       int32_t active;
+       int rc;
+
+       /* the database id is the whole payload of the control */
+       request.dsize = sizeof(db_id);
+       request.dptr = (uint8_t *)&db_id;
+
+       rc = ctdb_control(ctdb, destnode, 0,
+                         CTDB_CONTROL_TRANS2_ACTIVE,
+                         0, request, NULL, NULL, &active,
+                         NULL, NULL);
+       if (rc == 0) {
+               return active;
+       }
+
+       DEBUG(DEBUG_ERR, (__location__ " ctdb control for transaction_active failed\n"));
+       return -1;
 }
 
-int ctdb_ctrl_getcapabilities(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *capabilities)
-{
-       struct ctdb_client_control_state *state;
-       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
-       int ret;
-
-       state = ctdb_ctrl_getcapabilities_send(ctdb, tmp_ctx, timeout, destnode);
-       ret = ctdb_ctrl_getcapabilities_recv(ctdb, tmp_ctx, state, capabilities);
-       talloc_free(tmp_ctx);
-       return ret;
-}
 
+/* handle describing one transaction on a database (ctdb_db) */
 struct ctdb_transaction_handle {
        struct ctdb_db_context *ctdb_db;
        bool in_replay;
-       /* we store the reads and writes done under a transaction one
-          list stores both reads and writes, the other just writes
-       */
+       /*
+        * we store the reads and writes done under a transaction:
+        * - one list stores both reads and writes (m_all),
+        * - the other just writes (m_write)
+        */
        struct ctdb_marshall_buffer *m_all;
        struct ctdb_marshall_buffer *m_write;
 };
@@ -3142,11 +2757,14 @@ static int ctdb_transaction_fetch_start(struct ctdb_transaction_handle *h)
 {
        struct ctdb_record_handle *rh;
        TDB_DATA key;
+       TDB_DATA data;
        struct ctdb_ltdb_header header;
        TALLOC_CTX *tmp_ctx;
        const char *keyname = CTDB_TRANSACTION_LOCK_KEY;
        int ret;
        struct ctdb_db_context *ctdb_db = h->ctdb_db;
+       pid_t pid;
+       int32_t status;
 
        key.dptr = discard_const(keyname);
        key.dsize = strlen(keyname);
@@ -3161,10 +2779,42 @@ again:
 
        rh = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, NULL);
        if (rh == NULL) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed to fetch_lock database\n"));             
+               DEBUG(DEBUG_ERR,(__location__ " Failed to fetch_lock database\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       status = ctdb_ctrl_transaction_active(ctdb_db->ctdb,
+                                             CTDB_CURRENT_NODE,
+                                             ctdb_db->db_id);
+       if (status == 1) {
+               unsigned long int usec = (1000 + random()) % 100000;
+               DEBUG(DEBUG_DEBUG, (__location__ " transaction is active "
+                                   "on db_id[0x%08x]. waiting for %lu "
+                                   "microseconds\n",
+                                   ctdb_db->db_id, usec));
+               talloc_free(tmp_ctx);
+               usleep(usec);
+               goto again;
+       }
+
+       /*
+        * store the pid in the database:
+        * it is not enough that the node is dmaster...
+        */
+       pid = getpid();
+       data.dptr = (unsigned char *)&pid;
+       data.dsize = sizeof(pid_t);
+       rh->header.rsn++;
+       rh->header.dmaster = ctdb_db->ctdb->pnn;
+       ret = ctdb_ltdb_store(ctdb_db, key, &(rh->header), data);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to store pid in "
+                                 "transaction record\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
+
        talloc_free(rh);
 
        ret = tdb_transaction_start(ctdb_db->ltdb->tdb);
@@ -3174,8 +2824,26 @@ again:
                return -1;
        }
 
-       ret = ctdb_ltdb_fetch(ctdb_db, key, &header, tmp_ctx, NULL);
-       if (ret != 0 || header.dmaster != ctdb_db->ctdb->pnn) {
+       ret = ctdb_ltdb_fetch(ctdb_db, key, &header, tmp_ctx, &data);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to re-fetch transaction "
+                                "lock record inside transaction\n"));
+               tdb_transaction_cancel(ctdb_db->ltdb->tdb);
+               talloc_free(tmp_ctx);
+               goto again;
+       }
+
+       if (header.dmaster != ctdb_db->ctdb->pnn) {
+               DEBUG(DEBUG_DEBUG,(__location__ " not dmaster any more on "
+                                  "transaction lock record\n"));
+               tdb_transaction_cancel(ctdb_db->ltdb->tdb);
+               talloc_free(tmp_ctx);
+               goto again;
+       }
+
+       if ((data.dsize != sizeof(pid_t)) || (*(pid_t *)(data.dptr) != pid)) {
+               DEBUG(DEBUG_DEBUG, (__location__ " my pid is not stored in "
+                                   "the transaction lock record\n"));
                tdb_transaction_cancel(ctdb_db->ltdb->tdb);
                talloc_free(tmp_ctx);
                goto again;
@@ -3268,8 +2936,7 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h,
                /* the record doesn't exist - create one with us as dmaster.
                   This is only safe because we are in a transaction and this
                   is a persistent database */
-               header.dmaster = h->ctdb_db->ctdb->pnn;
-               header.rsn = 0;
+               ZERO_STRUCT(header);
        } else if (ret != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to fetch record\n"));
                talloc_free(tmp_ctx);
@@ -3283,6 +2950,7 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h,
                return 0;
        }
 
+       header.dmaster = h->ctdb_db->ctdb->pnn;
        header.rsn++;
 
        if (!h->in_replay) {
@@ -3409,6 +3077,9 @@ again:
                           &timeout, NULL);
        if (ret != 0 || status != 0) {
                tdb_transaction_cancel(h->ctdb_db->ltdb->tdb);
+               DEBUG(DEBUG_NOTICE, (__location__ " transaction commit%s failed"
+                                    ", retrying after 1 second...\n",
+                                    (retries==0)?"":"retry "));
                sleep(1);
 
                if (ret != 0) {
@@ -3428,7 +3099,7 @@ again:
                        }
                }
 
-               if (++retries == 10) {
+               if (++retries == 100) {
                        DEBUG(DEBUG_ERR,(__location__ " Giving up transaction on db 0x%08x after %d retries failure_control=%u\n", 
                                         h->ctdb_db->db_id, retries, (unsigned)failure_control));
                        ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
@@ -3439,7 +3110,11 @@ again:
                }               
 
                if (ctdb_replay_transaction(h) != 0) {
-                       DEBUG(DEBUG_ERR,(__location__ " Failed to replay transaction\n"));
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to replay "
+                                         "transaction on db 0x%08x, "
+                                         "failure control =%u\n",
+                                         h->ctdb_db->db_id,
+                                         (unsigned)failure_control));
                        ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
                                     failure_control, CTDB_CTRL_FLAG_NOREPLY, 
                                     tdb_null, NULL, NULL, NULL, NULL, NULL);           
@@ -3454,7 +3129,11 @@ again:
        /* do the real commit locally */
        ret = tdb_transaction_commit(h->ctdb_db->ltdb->tdb);
        if (ret != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " Failed to commit transaction\n"));
+               DEBUG(DEBUG_ERR, (__location__ " Failed to commit transaction "
+                                 "on db id 0x%08x locally, "
+                                 "failure_control=%u\n",
+                                 h->ctdb_db->db_id,
+                                 (unsigned)failure_control));
                ctdb_control(ctdb, CTDB_CURRENT_NODE, h->ctdb_db->db_id, 
                             failure_control, CTDB_CTRL_FLAG_NOREPLY, 
                             tdb_null, NULL, NULL, NULL, NULL, NULL);           
@@ -3508,11 +3187,6 @@ int switch_from_server_to_client(struct ctdb_context *ctdb)
        close(ctdb->daemon.sd);
        ctdb->daemon.sd = -1;
 
-       /* the client does not need to be realtime */
-       if (ctdb->do_setsched) {
-               ctdb_restore_scheduler(ctdb);
-       }
-
        /* initialise ctdb */
        ret = ctdb_socket_connect(ctdb);
        if (ret != 0) {
@@ -3524,35 +3198,55 @@ int switch_from_server_to_client(struct ctdb_context *ctdb)
 }
 
 /*
-  tell the main daemon we are starting a new monitor event script
+  get the status of running the monitor eventscripts: NULL means never run.
  */
-int ctdb_ctrl_event_script_init(struct ctdb_context *ctdb)
+int ctdb_ctrl_getscriptstatus(struct ctdb_context *ctdb, 
+               struct timeval timeout, uint32_t destnode, 
+               TALLOC_CTX *mem_ctx, enum ctdb_eventscript_call type,
+               struct ctdb_scripts_wire **script_status)
 {
        int ret;
+       TDB_DATA outdata, indata;
        int32_t res;
+       uint32_t uinttype = type;
 
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_EVENT_SCRIPT_INIT, 0, tdb_null, 
-                          ctdb, NULL, &res, NULL, NULL);
+       indata.dptr = (uint8_t *)&uinttype;
+       indata.dsize = sizeof(uinttype);
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS, 0, indata,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
        if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send event_script_init\n"));
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getscriptstatus failed ret:%d res:%d\n", ret, res));
                return -1;
        }
 
+       if (outdata.dsize == 0) {
+               *script_status = NULL;
+       } else {
+               *script_status = (struct ctdb_scripts_wire *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
+       }
+                   
        return 0;
 }
 
 /*
-  tell the main daemon we are starting a new monitor event script
+  tell the main daemon how long it took to lock the reclock file
  */
-int ctdb_ctrl_event_script_finished(struct ctdb_context *ctdb)
+int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency)
 {
        int ret;
        int32_t res;
+       TDB_DATA data;
+
+       data.dptr = (uint8_t *)&latency;
+       data.dsize = sizeof(latency);
 
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_EVENT_SCRIPT_FINISHED, 0, tdb_null
+       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_RECLOCK_LATENCY, 0, data
                           ctdb, NULL, &res, NULL, NULL);
        if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send event_script_init\n"));
+               DEBUG(DEBUG_ERR,("Failed to send recd reclock latency\n"));
                return -1;
        }
 
@@ -3560,92 +3254,136 @@ int ctdb_ctrl_event_script_finished(struct ctdb_context *ctdb)
 }
 
 /*
-  tell the main daemon we are starting to run an eventscript
+  get the name of the reclock file
  */
-int ctdb_ctrl_event_script_start(struct ctdb_context *ctdb, const char *name)
+int ctdb_ctrl_getreclock(struct ctdb_context *ctdb, struct timeval timeout,
+                        uint32_t destnode, TALLOC_CTX *mem_ctx,
+                        const char **name)
 {
        int ret;
        int32_t res;
        TDB_DATA data;
 
-       data.dptr = discard_const(name);
-       data.dsize = strlen(name)+1;
-
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_EVENT_SCRIPT_START, 0, data, 
-                          ctdb, NULL, &res, NULL, NULL);
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_RECLOCK_FILE, 0, tdb_null, 
+                          mem_ctx, &data, &res, &timeout, NULL);
        if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send event_script_start\n"));
                return -1;
        }
 
+       if (data.dsize == 0) {
+               *name = NULL;
+       } else {
+               *name = talloc_strdup(mem_ctx, discard_const(data.dptr));
+       }
+       talloc_free(data.dptr);
+
        return 0;
 }
 
 /*
-  tell the main daemon the status of the script we ran
+  set the reclock filename for a node
+
+  a NULL reclock is sent as an empty payload; otherwise the string is sent
+  including its terminating NUL. returns 0 on success and -1 if the
+  control fails or reports a non-zero status.
  */
-int ctdb_ctrl_event_script_stop(struct ctdb_context *ctdb, int32_t result)
+int ctdb_ctrl_setreclock(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *reclock)
 {
        int ret;
-       int32_t res;
        TDB_DATA data;
+       int32_t res;
 
-       data.dptr = (uint8_t *)&result;
-       data.dsize = sizeof(result);
+       if (reclock == NULL) {
+               data.dsize = 0;
+               data.dptr  = NULL;
+       } else {
+               data.dsize = strlen(reclock) + 1;
+               data.dptr  = discard_const(reclock);
+       }
 
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_EVENT_SCRIPT_STOP, 0, data, 
-                          ctdb, NULL, &res, NULL, NULL);
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_SET_RECLOCK_FILE, 0, data, 
+                          NULL, NULL, &res, &timeout, NULL);
        if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send event_script_stop\n"));
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setreclock failed\n"));
                return -1;
        }
 
        return 0;
 }
 
+/*
+  send the CTDB_CONTROL_STOP_NODE control to destnode.
+  returns 0 when the control succeeds with status 0, -1 otherwise
+ */
+int ctdb_ctrl_stop_node(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       int32_t status;
+       int rc;
+
+       rc = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_STOP_NODE, 0, tdb_null, 
+                         ctdb, NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,("Failed to stop node\n"));
+       return -1;
+}
 
 /*
-  get the status of running the monitor eventscripts
+  continue a node
+
+  sends CTDB_CONTROL_CONTINUE_NODE to destnode. returns 0 if ctdb_control
+  returns 0 and -1 otherwise.
+  NOTE(review): no status pointer is passed, so a status reported by the
+  remote side is not examined here (ctdb_ctrl_stop_node does check it) -
+  confirm this is intended
  */
-int ctdb_ctrl_getscriptstatus(struct ctdb_context *ctdb, 
-               struct timeval timeout, uint32_t destnode, 
-               TALLOC_CTX *mem_ctx,
-               struct ctdb_monitoring_wire **script_status)
+int ctdb_ctrl_continue_node(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
 {
        int ret;
-       TDB_DATA outdata;
+
+       ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_CONTINUE_NODE, 0, tdb_null, 
+                          ctdb, NULL, NULL, &timeout, NULL);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to continue node\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  set the natgw state for a node
+
+  natgwstate is sent as a uint32_t payload. returns 0 on success and -1 if
+  the control fails or reports a non-zero status.
+ */
+int ctdb_ctrl_setnatgwstate(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t natgwstate)
+{
+       int ret;
+       TDB_DATA data;
        int32_t res;
 
+       data.dsize = sizeof(natgwstate);
+       data.dptr  = (uint8_t *)&natgwstate;
+
        ret = ctdb_control(ctdb, destnode, 0, 
-                          CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS, 0, tdb_null, 
-                          mem_ctx, &outdata, &res, &timeout, NULL);
-       if (ret != 0 || res != 0 || outdata.dsize == 0) {
-               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getscriptstatus failed ret:%d res:%d\n", ret, res));
+                          CTDB_CONTROL_SET_NATGWSTATE, 0, data, 
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setnatgwstate failed\n"));
                return -1;
        }
 
-       *script_status = (struct ctdb_monitoring_wire *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
-       talloc_free(outdata.dptr);
-                   
        return 0;
 }
 
 /*
-  tell the main daemon how long it took to lock the reclock file
+  set the lmaster role for a node
+
+  lmasterrole is sent as a uint32_t payload. -1 is returned if the control
+  fails or reports a non-zero status.
  */
-int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency)
+int ctdb_ctrl_setlmasterrole(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t lmasterrole)
 {
        int ret;
-       int32_t res;
        TDB_DATA data;
+       int32_t res;
 
-       data.dptr = (uint8_t *)&latency;
-       data.dsize = sizeof(latency);
+       data.dsize = sizeof(lmasterrole);
+       data.dptr  = (uint8_t *)&lmasterrole;
 
-       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_RECLOCK_LATENCY, 0, data, 
-                          ctdb, NULL, &res, NULL, NULL);
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_SET_LMASTERROLE, 0, data, 
+                          NULL, NULL, &res, &timeout, NULL);
        if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,("Failed to send recd reclock latency\n"));
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setlmasterrole failed\n"));
                return -1;
        }
 
@@ -3653,57 +3391,165 @@ int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval
 }
 
 /*
-  get the name of the reclock file
+  set the recmaster role for a node
  */
-int ctdb_ctrl_getreclock(struct ctdb_context *ctdb, struct timeval timeout,
-                        uint32_t destnode, TALLOC_CTX *mem_ctx,
-                        const char **name)
+int ctdb_ctrl_setrecmasterrole(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmasterrole)
 {
        int ret;
-       int32_t res;
        TDB_DATA data;
+       int32_t res;
+
+       data.dsize = sizeof(recmasterrole);
+       data.dptr  = (uint8_t *)&recmasterrole;
 
        ret = ctdb_control(ctdb, destnode, 0, 
-                          CTDB_CONTROL_GET_RECLOCK_FILE, 0, tdb_null
-                          mem_ctx, &data, &res, &timeout, NULL);
+                          CTDB_CONTROL_SET_RECMASTERROLE, 0, data
+                          NULL, NULL, &res, &timeout, NULL);
        if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setrecmasterrole failed\n"));
                return -1;
        }
 
-       if (data.dsize == 0) {
-               *name = NULL;
-       } else {
-               *name = talloc_strdup(mem_ctx, discard_const(data.dptr));
+       return 0;
+}
+
+/* enable an eventscript
+ *
+ * sends CTDB_CONTROL_ENABLE_SCRIPT to destnode. the control payload is
+ * the script string including its terminating NUL byte. script is
+ * handed to strlen(), so it must not be NULL.
+ * returns 0 on success; logs an error and returns -1 when
+ * ctdb_control() returns non-zero or the control status (res) is
+ * non-zero.
+ */
+int ctdb_ctrl_enablescript(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *script)
+{
+       int ret;
+       TDB_DATA data;
+       int32_t res;
+
+       /* + 1 so that the terminating NUL is part of the payload */
+       data.dsize = strlen(script) + 1;
+       data.dptr  = discard_const(script);
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_ENABLE_SCRIPT, 0, data, 
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for enablescript failed\n"));
+               return -1;
        }
-       talloc_free(data.dptr);
 
        return 0;
 }
 
-/*
-  set the reclock filename for a node
+/* disable an eventscript
+ *
+ * sends CTDB_CONTROL_DISABLE_SCRIPT to destnode. the control payload is
+ * the script string including its terminating NUL byte. script is
+ * handed to strlen(), so it must not be NULL.
+ * returns 0 on success; logs an error and returns -1 when
+ * ctdb_control() returns non-zero or the control status (res) is
+ * non-zero.
  */
-int ctdb_ctrl_setreclock(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *reclock)
+int ctdb_ctrl_disablescript(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *script)
 {
        int ret;
        TDB_DATA data;
        int32_t res;
 
-       if (reclock == NULL) {
-               data.dsize = 0;
-               data.dptr  = NULL;
-       } else {
-               data.dsize = strlen(reclock) + 1;
-               data.dptr  = discard_const(reclock);
+       /* + 1 so that the terminating NUL is part of the payload */
+       data.dsize = strlen(script) + 1;
+       data.dptr  = discard_const(script);
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_DISABLE_SCRIPT, 0, data, 
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for disablescript failed\n"));
+               return -1;
        }
 
+       return 0;
+}
+
+
+/*
+  set the ban state on a node
+
+  sends CTDB_CONTROL_SET_BAN_STATE to destnode; the control payload is
+  the raw bytes of *bantime (sizeof(*bantime) of them). timeout is
+  passed through to ctdb_control().
+  returns 0 on success; logs an error and returns -1 when
+  ctdb_control() returns non-zero or the control status (res) is
+  non-zero.
+ */
+int ctdb_ctrl_set_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, struct ctdb_ban_time *bantime)
+{
+       int ret;
+       TDB_DATA data;
+       int32_t res;
+
+       data.dsize = sizeof(*bantime);
+       data.dptr  = (uint8_t *)bantime;
+
        ret = ctdb_control(ctdb, destnode, 0, 
-                          CTDB_CONTROL_SET_RECLOCK_FILE, 0, data, 
+                          CTDB_CONTROL_SET_BAN_STATE, 0, data, 
                           NULL, NULL, &res, &timeout, NULL);
        if (ret != 0 || res != 0) {
-               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setreclock failed\n"));
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set ban state failed\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  get the ban state of a node
+
+  sends CTDB_CONTROL_GET_BAN_STATE to destnode. on success the reply
+  buffer is reparented onto mem_ctx with talloc_steal() and returned
+  through *bantime; everything else allocated for the call is released
+  together with the temporary talloc context.
+  returns 0 on success; logs an error and returns -1 when
+  ctdb_control() returns non-zero, the control status (res) is
+  non-zero, or the reply is too short to hold a struct ctdb_ban_time.
+  *bantime is left untouched on failure.
+ */
+int ctdb_ctrl_get_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_ban_time **bantime)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_BAN_STATE, 0, tdb_null,
+                          tmp_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ban state failed\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* the caller dereferences *bantime as a full struct, so never
+          hand back a reply that is shorter than one */
+       if (outdata.dsize < sizeof(**bantime)) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ban state returned short reply of %u bytes\n",
+                                (unsigned)outdata.dsize));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       *bantime = (struct ctdb_ban_time *)talloc_steal(mem_ctx, outdata.dptr);
+       talloc_free(tmp_ctx);
+
+       return 0;
+}
+
+
+/*
+  set the priority of a database on a node
+
+  sends CTDB_CONTROL_SET_DB_PRIORITY to destnode; the control payload
+  is the raw bytes of *db_prio. a temporary talloc context is handed to
+  ctdb_control() and released again before returning.
+  returns 0 on success; logs an error and returns -1 when
+  ctdb_control() returns non-zero or the control status (res) is
+  non-zero.
+ */
+int ctdb_ctrl_set_db_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, struct ctdb_db_priority *db_prio)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       TDB_DATA data;
+       int32_t res;
+       int ret;
+       int status = 0;
+
+       data.dsize = sizeof(*db_prio);
+       data.dptr  = (uint8_t*)db_prio;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_SET_DB_PRIORITY, 0, data,
+                          tmp_ctx, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set_db_priority failed\n"));
+               status = -1;
+       }
+
+       /* single exit: the temporary context is dropped on both paths */
+       talloc_free(tmp_ctx);
+       return status;
+}
+
+/*
+  get the priority of a database on a node
+
+  sends CTDB_CONTROL_GET_DB_PRIORITY to destnode with db_id as the
+  control payload. the priority is taken from the control status
+  itself: a negative status is treated as failure, any other value is
+  stored in *priority. priority may be NULL, in which case the value is
+  discarded.
+  returns 0 on success; logs an error and returns -1 when
+  ctdb_control() returns non-zero or the control status is negative.
+ */
+int ctdb_ctrl_get_db_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t db_id, uint32_t *priority)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA data;
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+       data.dptr = (uint8_t*)&db_id;
+       data.dsize = sizeof(db_id);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_DB_PRIORITY, 0, data,
+                          tmp_ctx, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res < 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get_db_priority failed\n"));
+               talloc_free(tmp_ctx);
                return -1;
        }
 
+       if (priority) {
+               *priority = res;
+       }
+
+       talloc_free(tmp_ctx);
+
        return 0;
 }