From 02ee9dfd3c6b09f5c5172a7e38738c20b7f0aecd Mon Sep 17 00:00:00 2001
From: Michael Adam
Date: Wed, 28 Oct 2009 22:55:44 +0100
Subject: [PATCH] client: fix race condition with concurrent transactions on
 the same node.

In ctdb_transaction_commit(), when the trans2_commit control fails, there is
a race condition in the 1-second sleep between the local transaction_cancel
and the call to ctdb_replay_transaction():
The database is not locked, and neither is the transaction_lock record.
So another client can start and possibly complete a new transaction in
this gap, but only on the same node: The locking of the transaction_lock
record on a different node, which involves migration of the record to
the other node, has been disabled by the introduction of the
transaction_active flag on the db which closes precisely this gap from the
start of the commit until the call to TRANS2_FINISH or TRANS2_ERROR.

But this mechanism does not cover the case where a process on the same
node tries to start a transaction: There is no obstacle to locking the
transaction_lock record because the record does not need to be migrated.

This commit closes this race condition in ctdb_transaction_fetch_start()
by using the new ctdb_ctrl_transaction_active() call to ask the local
ctdb daemon whether it has a transaction running on the database.
If so, the check is repeated until the running transaction is done.

This does introduce an additional call to the local ctdbd when starting
transactions, but it does close the (hopefully) last race condition.

Michael --- client/ctdb_client.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/client/ctdb_client.c b/client/ctdb_client.c index fdbcc0b8..e7bd5f89 100644 --- a/client/ctdb_client.c +++ b/client/ctdb_client.c @@ -3198,6 +3198,7 @@ static int ctdb_transaction_fetch_start(struct ctdb_transaction_handle *h) int ret; struct ctdb_db_context *ctdb_db = h->ctdb_db; pid_t pid; + int32_t status; key.dptr = discard_const(keyname); key.dsize = strlen(keyname); @@ -3208,6 +3209,17 @@ static int ctdb_transaction_fetch_start(struct ctdb_transaction_handle *h) } again: + status = ctdb_ctrl_transaction_active(ctdb_db->ctdb, + CTDB_CURRENT_NODE, + ctdb_db->db_id); + if (status == 1) { + DEBUG(DEBUG_NOTICE, (__location__ " transaction is active " + "on db_id[%u]. waiting for 1 second\n", + ctdb_db->db_id)); + sleep(1); + goto again; + } + tmp_ctx = talloc_new(h); rh = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, NULL); -- 2.34.1