From Wolfgang Mueller,
author Ronnie Sahlberg <ronniesahlberg@gmail.com>
Tue, 13 Oct 2009 21:23:49 +0000 (08:23 +1100)
committer Ronnie Sahlberg <ronniesahlberg@gmail.com>
Tue, 13 Oct 2009 21:23:49 +0000 (08:23 +1100)
When we detect a dmaster migration error, trigger a recovery to repair the databases instead of calling ctdb_fatal().

server/ctdb_call.c

index b666a9ed1bc80d58d4467578e9bab457743c29f6..c2591a7b7a5ddb4205844586597bc2450a875b25 100644 (file)
@@ -343,7 +343,13 @@ void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr
                         (unsigned long long)c->rsn, (unsigned long long)header.rsn, c->hdr.reqid,
                         (key.dsize >= 4)?(*(uint32_t *)key.dptr):0));
                if (header.rsn != 0 || header.dmaster != ctdb->pnn) {
-                       ctdb_fatal(ctdb, "ctdb_req_dmaster from non-master");
+                       /*
+                        * we used to exit here with a ctdb_fatal(ctdb, "ctdb_req_dmaster from non-master");
+                        * as long as we haven't found a protocol problem we just initiate recovery
+                        */
+                       DEBUG(DEBUG_ALERT,(__location__"real-dmaster problem triggering recovery\n"));
+                       ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+                       ctdb_ltdb_unlock(ctdb_db, key);
                        return;
                }
        }