vacuum: turn the vacuuming on lmaster into a three-phase process.
author Michael Adam <obnox@samba.org>
Sat, 5 Jan 2013 00:20:18 +0000 (01:20 +0100)
committer Michael Adam <obnox@samba.org>
Fri, 26 Apr 2013 14:52:02 +0000 (16:52 +0200)
More precisely, before locally deleting an empty record that has been
migrated with data and for which we are both dmaster and lmaster, we
now perform the deletion on the other nodes in two steps instead of a
single step (a rough sketch of the resulting flow follows the list
below):

- First send out the list of records to be deleted to all
  other nodes with the new RECEIVE_RECORDS control to store
  the lmaster's current empty copy.
- Then send those records whose empty copy could be stored on all
  nodes to all nodes again with the TRY_DELETE_RECORDS control
  as before for deletion.
- Finally delete those records locally that were successfully
  deleted remotely in the previous step.
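
A rough, hypothetical sketch of this flow on the lmaster (the helper
names and the delete_list type are invented for illustration and assume
the usual ctdb headers; the actual patch below uses ctdb_control() and
trbt_traversearray32() over vdata->delete_list):

    /* hypothetical helpers and type, not part of the real ctdb code */
    struct delete_list;
    int  push_to_active_nodes(uint32_t opcode, struct delete_list *list);
    void drop_failed_records(struct delete_list *list);
    void delete_remaining_locally(struct delete_list *list);

    static void lmaster_vacuum_three_phase(struct delete_list *list)
    {
            /* Phase 1: store our empty dmaster copy on all active nodes */
            push_to_active_nodes(CTDB_CONTROL_RECEIVE_RECORDS, list);
            drop_failed_records(list);

            /* Phase 2: ask all active nodes to delete their copy */
            push_to_active_nodes(CTDB_CONTROL_TRY_DELETE_RECORDS, list);
            drop_failed_records(list);

            /* Phase 3: delete the remaining records locally */
            delete_remaining_locally(list);
    }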

This fixes an old race where a recovery that hits the vacuum process
square between the eyes can create gaps in a record's history and
hence let old copies of the record resurrect. In the case of the
locking.tdb, this could mean that a file that had already been closed
was recorded as being open and locked again, so samba clients were
locked out of that file until samba was restarted.
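
For context on the resurrection: a recovery pulls each record from all
nodes and keeps the copy with the highest RSN. A minimal, hypothetical
illustration of that selection (the type and function are invented and
assume <stdint.h>; the real logic lives in the recovery code, not in
this patch):

    /* hypothetical record copy, reduced to the RSN for illustration */
    struct record_copy {
            uint64_t rsn;   /* record sequence number */
            /* header and data omitted */
    };

    /* the copy with the highest RSN survives the recovery */
    static struct record_copy pick_recovery_winner(const struct record_copy *c,
                                                   int n)
    {
            struct record_copy best = c[0];
            int i;

            for (i = 1; i < n; i++) {
                    if (c[i].rsn > best.rsn) {
                            best = c[i];
                    }
            }
            return best;
    }

If an interrupted vacuum run leaves an old, non-empty copy with the
highest RSN somewhere in the cluster, that stale copy wins the next
recovery and the record resurrects; bumping the RSN on the lmaster
before rolling out the empty copy closes that window.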

Signed-off-by: Michael Adam <obnox@samba.org>
Reviewed-By: Amitay Isaacs <amitay@gmail.com>
(cherry picked from commit eee23d44b6427be8ab49bbfcee3abb62f37dfcc7)

server/ctdb_vacuum.c

index c6858909469b039cab0aaad46e3557bfeac2d22f..9480edbad268d26d8c6a2a243def7d8ee955c718 100644
@@ -98,6 +98,7 @@ struct delete_record_data {
 
 struct delete_records_list {
        struct ctdb_marshall_buffer *records;
+       struct vacuum_data *vdata;
 };
 
 /**
@@ -304,6 +305,133 @@ static int delete_marshall_traverse(void *param, void *data)
        return 0;
 }
 
+/**
+ * Variant of delete_marshall_traverse() that bumps the
+ * RSN of each traversed record in the database.
+ *
+ * This is needed to ensure that when rolling out our
+ * empty record copy before remote deletion, we as the
+ * record's dmaster keep a higher RSN than the non-dmaster
+ * nodes. This prevents old copies from resurrecting
+ * in recoveries.
+ */
+static int delete_marshall_traverse_first(void *param, void *data)
+{
+       struct delete_record_data *dd = talloc_get_type(data, struct delete_record_data);
+       struct delete_records_list *recs = talloc_get_type(param, struct delete_records_list);
+       struct ctdb_db_context *ctdb_db = dd->ctdb_db;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       struct ctdb_ltdb_header *header;
+       TDB_DATA tdb_data, ctdb_data;
+       uint32_t lmaster;
+       uint32_t hash = ctdb_hash(&(dd->key));
+       int res;
+
+       res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
+       if (res != 0) {
+               DEBUG(DEBUG_ERR,
+                     (__location__ " Error getting chainlock on record with "
+                      "key hash [0x%08x] on database db[%s].\n",
+                      hash, ctdb_db->db_name));
+               recs->vdata->delete_skipped++;
+               talloc_free(dd);
+               return 0;
+       }
+
+       /*
+        * Verify that the record is still empty, its RSN has not
+        * changed and that we are still its lmaster and dmaster.
+        */
+
+       tdb_data = tdb_fetch(ctdb_db->ltdb->tdb, dd->key);
+       if (tdb_data.dsize < sizeof(struct ctdb_ltdb_header)) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] does not exist or is not"
+                                  " a ctdb-record.  skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       if (tdb_data.dsize > sizeof(struct ctdb_ltdb_header)) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] has been recycled. "
+                                  "skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       header = (struct ctdb_ltdb_header *)tdb_data.dptr;
+
+       if (header->dmaster != ctdb->pnn) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] has been migrated away. "
+                                  "skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       if (header->rsn != dd->hdr.rsn) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] seems to have been "
+                                  "migrated away and back again (with empty "
+                                  "data). skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key);
+
+       if (lmaster != ctdb->pnn) {
+               DEBUG(DEBUG_INFO, (__location__ ": not lmaster for record in "
+                                  "delete list (key hash [0x%08x], db[%s]). "
+                                  "Strange! skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       /*
+        * Increment the record's RSN to ensure the dmaster (i.e. the current
+        * node) has the highest RSN of the record in the cluster.
+        * This is to prevent old record copies from resurrecting in recoveries
+        * if something should fail during the deletion process.
+        * Note that ctdb_ltdb_store_server() increments the RSN if called
+        * on the record's dmaster.
+        */
+
+       ctdb_data.dptr = tdb_data.dptr + sizeof(struct ctdb_ltdb_header);
+       ctdb_data.dsize = tdb_data.dsize - sizeof(struct ctdb_ltdb_header);
+
+       res = ctdb_ltdb_store(ctdb_db, dd->key, header, ctdb_data);
+       if (res != 0) {
+               DEBUG(DEBUG_ERR, (__location__ ": Failed to store record with "
+                                 "key hash [0x%08x] on database db[%s].\n",
+                                 hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+       goto done;
+
+skip:
+       tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+       recs->vdata->delete_skipped++;
+       talloc_free(dd);
+       dd = NULL;
+
+done:
+       if (tdb_data.dptr != NULL) {
+               free(tdb_data.dptr);
+       }
+
+       if (dd == NULL) {
+               return 0;
+       }
+
+       return delete_marshall_traverse(param, data);
+}
+
 /**
  * traverse function for the traversal of the delete_queue,
  * the fast-path vacuuming list.
@@ -508,11 +636,12 @@ static int delete_record_traverse(void *param, void *data)
                goto done;
        }
 
-
-       if (header->rsn != dd->hdr.rsn) {
+       if (header->rsn != dd->hdr.rsn + 1) {
                /*
                 * The record has been migrated off the node and back again.
                 * But not requeued for deletion. Skip it.
+                * (Note that the first marshall traverse has bumped the RSN
+                *  on disk.)
                 */
                DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
                                   "on database db[%s] seems to have been "
@@ -723,8 +852,7 @@ static int ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
        vdata->delete_left = vdata->delete_count;
 
        /*
-        * now tell all the active nodes to delete all these records
-        * (if possible)
+        * get the list of currently active nodes
         */
 
        ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(),
@@ -743,6 +871,20 @@ static int ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
        /* yuck! ;-) */
        num_active_nodes = talloc_get_size(active_nodes)/sizeof(*active_nodes);
 
+       /*
+        * Now delete the records on all active nodes in a three-phase process:
+        * 1) send all active remote nodes the current empty copy with this
+        *    node as DMASTER
+        * 2) if all nodes could store the new copy,
+        *    tell all the active remote nodes to delete all their copies
+        * 3) if all remote nodes deleted their record copy, delete it locally
+        */
+
+       /*
+        * Step 1:
+        * Send currently empty record copy to all active nodes for storing.
+        */
+
        recs = talloc_zero(tmp_ctx, struct delete_records_list);
        if (recs == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
@@ -758,11 +900,114 @@ static int ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
                goto done;
        }
        recs->records->db_id = ctdb_db->db_id;
+       recs->vdata = vdata;
 
        /*
         * traverse the tree of all records we want to delete and
         * create a blob we can send to the other nodes.
+        *
+        * We call delete_marshall_traverse_first() to bump the
+        * records' RSNs in the database, to ensure we (as dmaster)
+        * keep the highest RSN of the records in the cluster.
+        */
+       trbt_traversearray32(vdata->delete_list, 1,
+                            delete_marshall_traverse_first, recs);
+
+       indata.dsize = talloc_get_size(recs->records);
+       indata.dptr  = (void *)recs->records;
+
+       for (i = 0; i < num_active_nodes; i++) {
+               struct ctdb_marshall_buffer *records;
+               struct ctdb_rec_data *rec;
+               int32_t res;
+               TDB_DATA outdata;
+
+               ret = ctdb_control(ctdb, active_nodes[i], 0,
+                               CTDB_CONTROL_RECEIVE_RECORDS, 0,
+                               indata, recs, &outdata, &res,
+                               NULL, NULL);
+               if (ret != 0 || res != 0) {
+                       DEBUG(DEBUG_ERR, ("Error storing record copies on "
+                                         "node %u: ret[%d] res[%d]\n",
+                                         active_nodes[i], ret, res));
+                       ret = -1;
+                       goto done;
+               }
+
+               /*
+                * outdata contains the list of records coming back
+                * from the node: These are the records that the
+                * remote node could not store. We remove these from
+                * the list to process further.
+                */
+               records = (struct ctdb_marshall_buffer *)outdata.dptr;
+               rec = (struct ctdb_rec_data *)&records->data[0];
+               while (records->count-- > 1) {
+                       TDB_DATA reckey, recdata;
+                       struct ctdb_ltdb_header *rechdr;
+                       struct delete_record_data *dd;
+
+                       reckey.dptr = &rec->data[0];
+                       reckey.dsize = rec->keylen;
+                       recdata.dptr = &rec->data[reckey.dsize];
+                       recdata.dsize = rec->datalen;
+
+                       if (recdata.dsize < sizeof(struct ctdb_ltdb_header)) {
+                               DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
+                               ret = -1;
+                               goto done;
+                       }
+                       rechdr = (struct ctdb_ltdb_header *)recdata.dptr;
+                       recdata.dptr += sizeof(*rechdr);
+                       recdata.dsize -= sizeof(*rechdr);
+
+                       dd = (struct delete_record_data *)trbt_lookup32(
+                                       vdata->delete_list,
+                                       ctdb_hash(&reckey));
+                       if (dd != NULL) {
+                               /*
+                                * The other node could not store the record
+                                * copy and it is the first node that failed.
+                                * So we should remove it from the tree and
+                                * update statistics.
+                                */
+                               talloc_free(dd);
+                               vdata->delete_remote_error++;
+                               vdata->delete_left--;
+                       }
+
+                       rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
+               }
+       }
+
+       if (vdata->delete_left == 0) {
+               goto success;
+       }
+
+       /*
+        * Step 2:
+        * Send the remaining records to all active nodes for deletion.
+        *
+        * The lmaster's (i.e. our) copies of these records have been stored
+        * successfully on the other nodes.
         */
+
+       /*
+        * Create a marshall blob from the remaining list of records to delete.
+        */
+
+       talloc_free(recs->records);
+
+       recs->records = (struct ctdb_marshall_buffer *)
+               talloc_zero_size(recs,
+                                offsetof(struct ctdb_marshall_buffer, data));
+       if (recs->records == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               ret = -1;
+               goto done;
+       }
+       recs->records->db_id = ctdb_db->db_id;
+
        trbt_traversearray32(vdata->delete_list, 1,
                             delete_marshall_traverse, recs);
 
@@ -790,18 +1035,8 @@ static int ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
                /*
                 * outdata contains the list of records coming back
                 * from the node: These are the records that the
-                * remote node could not delete.
-                *
-                * NOTE: There is a problem here:
-                *
-                * When a node failed to delete the record, but
-                * others succeeded, we may have created gaps in the
-                * history of the record. Hence when a node dies, an
-                * closed file handle might be resurrected or an open
-                * file handle might be lost, leading to blocked access
-                * or data corruption.
-                *
-                * TODO: This needs to be fixed!
+                * remote node could not delete. We remove these from
+                * the list to delete locally.
                 */
                records = (struct ctdb_marshall_buffer *)outdata.dptr;
                rec = (struct ctdb_rec_data *)&records->data[0];
@@ -843,17 +1078,23 @@ static int ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
                }
        }
 
-       if (vdata->delete_left > 0) {
-               /*
-                * The only records remaining in the tree are those
-                * records which all other nodes could successfully
-                * delete, so we can safely delete them on the
-                * lmaster as well.
-                */
-               trbt_traversearray32(vdata->delete_list, 1,
-                                    delete_record_traverse, vdata);
+       if (vdata->delete_left == 0) {
+               goto success;
        }
 
+       /*
+        * Step 3:
+        * Delete the remaining records locally.
+        *
+        * These records have successfully been deleted on all
+        * active remote nodes.
+        */
+
+       trbt_traversearray32(vdata->delete_list, 1,
+                            delete_record_traverse, vdata);
+
+success:
+
        if (vdata->delete_count > 0) {
                DEBUG(DEBUG_INFO,
                      (__location__