Dont allow client processes to attach to databases while we are still in recovery...
[sahlberg/ctdb.git] / server / ctdb_ltdb_server.c
index ff0d347e829a1740643def6a315abfb229dddd23..8340c37fbdef9ed0bd693d0b38ce859a3d7d0c87 100644 (file)
@@ -5,7 +5,7 @@
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
+   the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.
    
    This program is distributed in the hope that it will be useful,
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
 
 #include "includes.h"
-#include "lib/events/events.h"
+#include "lib/tevent/tevent.h"
 #include "lib/tdb/include/tdb.h"
 #include "system/network.h"
 #include "system/filesys.h"
+#include "system/dir.h"
+#include "system/time.h"
 #include "../include/ctdb_private.h"
 #include "db_wrap.h"
 #include "lib/util/dlinklist.h"
+#include <ctype.h>
+
+#define PERSISTENT_HEALTH_TDB "persistent_health.tdb"
 
 /*
   this is the dummy null procedure that all databases support
@@ -63,12 +67,12 @@ static void lock_fetch_callback(void *p)
        struct lock_fetch_state *state = talloc_get_type(p, struct lock_fetch_state);
        if (!state->ignore_generation &&
            state->generation != state->ctdb->vnn_map->generation) {
-               DEBUG(0,("Discarding previous generation lockwait packet\n"));
+               DEBUG(DEBUG_NOTICE,("Discarding previous generation lockwait packet\n"));
                talloc_free(state->hdr);
                return;
        }
        state->recv_pkt(state->recv_context, state->hdr);
-       DEBUG(2,(__location__ " PACKET REQUEUED\n"));
+       DEBUG(DEBUG_INFO,(__location__ " PACKET REQUEUED\n"));
 }
 
 
@@ -137,7 +141,6 @@ int ctdb_ltdb_lock_requeue(struct ctdb_db_context *ctdb_db,
        /* now the contended path */
        h = ctdb_lockwait(ctdb_db, key, lock_fetch_callback, state);
        if (h == NULL) {
-               tdb_chainunlock(tdb, key);
                return -1;
        }
 
@@ -166,7 +169,11 @@ int ctdb_ltdb_lock_fetch_requeue(struct ctdb_db_context *ctdb_db,
        if (ret == 0) {
                ret = ctdb_ltdb_fetch(ctdb_db, key, header, hdr, data);
                if (ret != 0) {
-                       ctdb_ltdb_unlock(ctdb_db, key);
+                       int uret;
+                       uret = ctdb_ltdb_unlock(ctdb_db, key);
+                       if (uret != 0) {
+                               DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", uret));
+                       }
                }
        }
        return ret;
@@ -181,90 +188,532 @@ static void ctdb_check_db_empty(struct ctdb_db_context *ctdb_db)
        struct tdb_context *tdb = ctdb_db->ltdb->tdb;
        int count = tdb_traverse_read(tdb, NULL, NULL);
        if (count != 0) {
-               DEBUG(0,(__location__ " tdb '%s' not empty on attach! aborting\n",
+               DEBUG(DEBUG_ALERT,(__location__ " tdb '%s' not empty on attach! aborting\n",
                         ctdb_db->db_path));
                ctdb_fatal(ctdb_db->ctdb, "database not empty on attach");
        }
 }
 
+/*
+  refresh ctdb_db->unhealthy_reason from the persistent health tdb
+
+  The record is looked up under the database name (without the
+  trailing NUL). A non-empty record becomes the new unhealthy reason,
+  a missing or empty record leaves the reason NULL.
+
+  returns 0 on success, or -1 if the reason could not be duplicated,
+  in which case the previous reason is put back
+ */
+int ctdb_load_persistent_health(struct ctdb_context *ctdb,
+                               struct ctdb_db_context *ctdb_db)
+{
+       struct tdb_context *health_tdb = ctdb->db_persistent_health->tdb;
+       char *previous = ctdb_db->unhealthy_reason;
+       char *loaded = NULL;
+       TDB_DATA name_key;
+       TDB_DATA record;
+
+       name_key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+       name_key.dsize = strlen(ctdb_db->db_name);
+
+       /* detach the old reason while the record is being fetched */
+       ctdb_db->unhealthy_reason = NULL;
+
+       record = tdb_fetch(health_tdb, name_key);
+       if (record.dsize > 0) {
+               loaded = talloc_strndup(ctdb_db,
+                                       (const char *)record.dptr,
+                                       record.dsize);
+               if (loaded == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " talloc_strndup(%d) failed\n",
+                                          (int)record.dsize));
+                       ctdb_db->unhealthy_reason = previous;
+                       free(record.dptr);
+                       return -1;
+               }
+       }
+
+       /* the fetched buffer is released with free(); free(NULL) is a no-op */
+       free(record.dptr);
+
+       talloc_free(previous);
+       ctdb_db->unhealthy_reason = loaded;
+       return 0;
+}
+
+/*
+  record a new health state for a persistent database
+
+  Inside a transaction on the persistent health tdb the currently stored
+  reason is reloaded and then replaced:
+   - given_reason != NULL: that text becomes the new unhealthy reason
+   - given_reason == NULL, a reason is stored and num_healthy_nodes == 0:
+     the stored reason is kept, prefixed (once) with "NO-HEALTHY-NODES - "
+   - otherwise a stored reason is deleted, i.e. the database is healthy
+
+  On success ctdb_db->unhealthy_reason holds the new reason (NULL when
+  healthy). Every failure between tdb_transaction_start() and the commit
+  cancels the transaction so it is not left open on the health tdb.
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_update_persistent_health(struct ctdb_context *ctdb,
+                                 struct ctdb_db_context *ctdb_db,
+                                 const char *given_reason,/* NULL means healthy */
+                                 int num_healthy_nodes)
+{
+       static const char no_healthy_prefix[] = "NO-HEALTHY-NODES - ";
+       struct tdb_context *tdb = ctdb->db_persistent_health->tdb;
+       int ret;
+       TDB_DATA key;
+       TDB_DATA val;
+       char *new_reason = NULL;
+       char *old_reason = NULL;
+
+       ret = tdb_transaction_start(tdb);
+       if (ret != 0) {
+               DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_start('%s') failed: %d - %s\n",
+                                  tdb_name(tdb), ret, tdb_errorstr(tdb)));
+               return -1;
+       }
+
+       ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+       if (ret != 0) {
+               DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+                                  ctdb_db->db_name, ret));
+               tdb_transaction_cancel(tdb);
+               return -1;
+       }
+       old_reason = ctdb_db->unhealthy_reason;
+
+       key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+       key.dsize = strlen(ctdb_db->db_name);
+
+       if (given_reason) {
+               new_reason = talloc_strdup(ctdb_db, given_reason);
+               if (new_reason == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " talloc_strdup(%s) failed\n",
+                                         given_reason));
+                       tdb_transaction_cancel(tdb);
+                       return -1;
+               }
+       } else if (old_reason && num_healthy_nodes == 0) {
+               /*
+                * If the reason indicates ok, but there were no healthy
+                * nodes available, it means we have not recovered valid
+                * content of the db. So if there's an old reason, prefix
+                * it with "NO-HEALTHY-NODES - " unless it already
+                * carries that prefix.
+                */
+               const char *prefix = no_healthy_prefix;
+
+               if (strncmp(no_healthy_prefix, old_reason,
+                           sizeof(no_healthy_prefix) - 1) == 0) {
+                       prefix = "";
+               }
+               new_reason = talloc_asprintf(ctdb_db, "%s%s",
+                                        prefix, old_reason);
+               if (new_reason == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " talloc_asprintf(%s%s) failed\n",
+                                         prefix, old_reason));
+                       tdb_transaction_cancel(tdb);
+                       return -1;
+               }
+       }
+
+       if (new_reason) {
+               val.dptr = discard_const_p(uint8_t, new_reason);
+               val.dsize = strlen(new_reason);
+
+               ret = tdb_store(tdb, key, val, TDB_REPLACE);
+               if (ret != 0) {
+                       /* log before cancelling so tdb_errorstr() still refers to the store */
+                       DEBUG(DEBUG_ALERT,(__location__ " tdb_store('%s', %s, %s) failed: %d - %s\n",
+                                          tdb_name(tdb), ctdb_db->db_name, new_reason,
+                                          ret, tdb_errorstr(tdb)));
+                       tdb_transaction_cancel(tdb);
+                       talloc_free(new_reason);
+                       return -1;
+               }
+               DEBUG(DEBUG_ALERT,("Updated db health for db(%s) to: %s\n",
+                                  ctdb_db->db_name, new_reason));
+       } else if (old_reason) {
+               ret = tdb_delete(tdb, key);
+               if (ret != 0) {
+                       /* log before cancelling so tdb_errorstr() still refers to the delete */
+                       DEBUG(DEBUG_ALERT,(__location__ " tdb_delete('%s', %s) failed: %d - %s\n",
+                                          tdb_name(tdb), ctdb_db->db_name,
+                                          ret, tdb_errorstr(tdb)));
+                       tdb_transaction_cancel(tdb);
+                       return -1;
+               }
+               DEBUG(DEBUG_NOTICE,("Updated db health for db(%s): OK\n",
+                                  ctdb_db->db_name));
+       }
+
+       ret = tdb_transaction_commit(tdb);
+       if (ret != TDB_SUCCESS) {
+               DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_commit('%s') failed: %d - %s\n",
+                                  tdb_name(tdb), ret, tdb_errorstr(tdb)));
+               talloc_free(new_reason);
+               return -1;
+       }
+
+       /* only now that the change is committed does the in-memory copy switch over */
+       talloc_free(old_reason);
+       ctdb_db->unhealthy_reason = new_reason;
+
+       return 0;
+}
+
+/*
+  move a corrupted persistent tdb out of the way
+
+  The file at ctdb_db->db_path is renamed to
+  "<db_path>.corrupted.<UTC timestamp>.0Z", e.g.
+  foo.tdb.0.corrupted.20091204160825.0Z. Before the rename the database
+  is marked unhealthy in the persistent health tdb with a reason that
+  names the backup file.
+
+  returns 0 on success, -1 on failure
+ */
+static int ctdb_backup_corrupted_tdb(struct ctdb_context *ctdb,
+                                    struct ctdb_db_context *ctdb_db)
+{
+       time_t now = time(NULL);
+       char *new_path;
+       char *new_reason;
+       int ret;
+       struct tm *tm;
+
+       tm = gmtime(&now);
+       if (tm == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " gmtime() failed\n"));
+               return -1;
+       }
+
+       /* formatted like: foo.tdb.0.corrupted.20091204160825.0Z */
+       new_path = talloc_asprintf(ctdb_db, "%s.corrupted."
+                                  "%04u%02u%02u%02u%02u%02u.0Z",
+                                  ctdb_db->db_path,
+                                  tm->tm_year+1900, tm->tm_mon+1,
+                                  tm->tm_mday, tm->tm_hour, tm->tm_min,
+                                  tm->tm_sec);
+       if (new_path == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+               return -1;
+       }
+
+       new_reason = talloc_asprintf(ctdb_db,
+                                    "ERROR - Backup of corrupted TDB in '%s'",
+                                    new_path);
+       if (new_reason == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+               /* new_path hangs off ctdb_db, release it on every exit */
+               talloc_free(new_path);
+               return -1;
+       }
+       ret = ctdb_update_persistent_health(ctdb, ctdb_db, new_reason, 0);
+       talloc_free(new_reason);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,(__location__
+                                ": ctdb_backup_corrupted_tdb(%s) failed to record the backup in the health db\n",
+                                ctdb_db->db_path));
+               talloc_free(new_path);
+               return -1;
+       }
+
+       ret = rename(ctdb_db->db_path, new_path);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,(__location__
+                                 ": ctdb_backup_corrupted_tdb(%s) rename to %s failed: %d - %s\n",
+                                 ctdb_db->db_path, new_path,
+                                 errno, strerror(errno)));
+               talloc_free(new_path);
+               return -1;
+       }
+
+       DEBUG(DEBUG_CRIT,(__location__
+                        ": ctdb_backup_corrupted_tdb(%s) renamed to %s\n",
+                        ctdb_db->db_path, new_path));
+       talloc_free(new_path);
+       return 0;
+}
+
+/*
+  reload and report the health of every attached persistent database
+
+  Walks ctdb->db_list, skips non-persistent databases, refreshes each
+  unhealthy_reason from the persistent health tdb and logs the result
+  per database plus a final OK/FAIL summary.
+
+  returns 0 if all persistent databases are healthy, -1 if at least one
+  is unhealthy or if loading the health of a database failed (in that
+  case the walk stops at once and no summary is logged)
+ */
+int ctdb_recheck_persistent_health(struct ctdb_context *ctdb)
+{
+       struct ctdb_db_context *ctdb_db;
+       int ret;
+       int ok = 0;
+       int fail = 0;
+
+       for (ctdb_db = ctdb->db_list; ctdb_db; ctdb_db = ctdb_db->next) {
+               if (!ctdb_db->persistent) {
+                       continue;
+               }
+
+               ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ALERT,(__location__
+                                          " load persistent health for '%s' failed\n",
+                                          ctdb_db->db_path));
+                       return -1;
+               }
+
+               if (ctdb_db->unhealthy_reason == NULL) {
+                       ok++;
+                       DEBUG(DEBUG_INFO,(__location__
+                                  " persistent db '%s' healthy\n",
+                                  ctdb_db->db_path));
+                       continue;
+               }
+
+               fail++;
+               DEBUG(DEBUG_ALERT,(__location__
+                                  " persistent db '%s' unhealthy: %s\n",
+                                  ctdb_db->db_path,
+                                  ctdb_db->unhealthy_reason));
+       }
+       /* the summary is raised to ALERT as soon as one database is unhealthy */
+       DEBUG((fail!=0)?DEBUG_ALERT:DEBUG_NOTICE,
+             ("ctdb_recheck_persistent_health: OK[%d] FAIL[%d]\n",
+              ok, fail));
+
+       if (fail != 0) {
+               return -1;
+       }
+
+       return 0;
+}
+
+
 /*
-  a client has asked to attach a new database
+  mark a database as healthy
+
+  indata carries the 32-bit id of the database. The stored unhealthy
+  reason is cleared through ctdb_update_persistent_health() with a NULL
+  reason and one healthy node. If the database had an unhealthy reason
+  in memory before that call and startup is not done yet
+  (ctdb->done_startup is false), the recovery mode is set to
+  CTDB_RECOVERY_ACTIVE.
+
+  returns 0 on success, -1 if the db id is unknown or the update failed
  */
-int32_t ctdb_control_db_attach(struct ctdb_context *ctdb, TDB_DATA indata,
-                              TDB_DATA *outdata)
+int32_t ctdb_control_db_set_healthy(struct ctdb_context *ctdb, TDB_DATA indata)
 {
-       const char *db_name = (const char *)indata.dptr;
-       struct ctdb_db_context *ctdb_db, *tmp_db;
+       uint32_t db_id = *(uint32_t *)indata.dptr; /* NOTE(review): presumably the control dispatcher validated indata.dsize - confirm */
+       struct ctdb_db_context *ctdb_db;
        int ret;
+       bool may_recover = false;
 
-       /* see if we already have this name */
-       for (tmp_db=ctdb->db_list;tmp_db;tmp_db=tmp_db->next) {
-               if (strcmp(db_name, tmp_db->db_name) == 0) {
-                       /* this is not an error */
-                       outdata->dptr  = (uint8_t *)&tmp_db->db_id;
-                       outdata->dsize = sizeof(tmp_db->db_id);
-                       return 0;
-               }
+       ctdb_db = find_ctdb_db(ctdb, db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id));
+               return -1;
+       }
+
+       if (ctdb_db->unhealthy_reason) { /* sampled before the update below reloads and replaces the reason */
+               may_recover = true;
+       }
+
+       ret = ctdb_update_persistent_health(ctdb, ctdb_db, NULL, 1);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__
+                                " ctdb_update_persistent_health(%s) failed\n",
+                                ctdb_db->db_name));
+               return -1;
+       }
+
+       if (may_recover && !ctdb->done_startup) { /* only while startup is still in progress */
+               DEBUG(DEBUG_ERR, (__location__ " db %s become healthy  - force recovery for startup\n",
+                                 ctdb_db->db_name));
+               ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+       }
+
+       return 0;
+}
+
+int32_t ctdb_control_db_get_health(struct ctdb_context *ctdb,
+                                  TDB_DATA indata,     /* carries the 32-bit db id */
+                                  TDB_DATA *outdata)   /* receives the unhealthy reason, see below */
+{      /*
+        * report the health of a database
+        *
+        * The unhealthy reason is reloaded from the persistent health
+        * tdb first. *outdata is tdb_null for a healthy database,
+        * otherwise it references the reason string including its
+        * terminating NUL.
+        *
+        * returns 0 on success, -1 if the db id is unknown or the
+        * reload failed
+        */
+       uint32_t db_id = *(uint32_t *)indata.dptr; /* NOTE(review): presumably the control dispatcher validated indata.dsize - confirm */
+       struct ctdb_db_context *ctdb_db;
+       int ret;
+
+       ctdb_db = find_ctdb_db(ctdb, db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id));
+               return -1;
+       }
+
+       ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__
+                                " ctdb_load_persistent_health(%s) failed\n",
+                                ctdb_db->db_name));
+               return -1;
        }
 
+       *outdata = tdb_null;
+       if (ctdb_db->unhealthy_reason) {
+               /* not a copy: outdata points straight at ctdb_db->unhealthy_reason */
+               outdata->dptr = (uint8_t *)ctdb_db->unhealthy_reason;
+               outdata->dsize = strlen(ctdb_db->unhealthy_reason)+1;
+       }
+
+       return 0;
+}
+
+/*
+  attach to a database, handling both persistent and non-persistent databases
+  return 0 on success, -1 on failure
+ */
+static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
+                            bool persistent, const char *unhealthy_reason,
+                            bool jenkinshash)
+{
+       struct ctdb_db_context *ctdb_db, *tmp_db;
+       int ret;
+       struct TDB_DATA key;
+       unsigned tdb_flags;
+       int mode = 0600;
+       int remaining_tries = 0;
+
        ctdb_db = talloc_zero(ctdb, struct ctdb_db_context);
        CTDB_NO_MEMORY(ctdb, ctdb_db);
 
+       ctdb_db->priority = 1;
        ctdb_db->ctdb = ctdb;
        ctdb_db->db_name = talloc_strdup(ctdb_db, db_name);
        CTDB_NO_MEMORY(ctdb, ctdb_db->db_name);
 
-       ctdb_db->db_id = ctdb_hash(&indata);
-
-       outdata->dptr  = (uint8_t *)&ctdb_db->db_id;
-       outdata->dsize = sizeof(ctdb_db->db_id);
+       key.dsize = strlen(db_name)+1;
+       key.dptr  = discard_const(db_name);
+       ctdb_db->db_id = ctdb_hash(&key);
+       ctdb_db->persistent = persistent;
 
        /* check for hash collisions */
        for (tmp_db=ctdb->db_list;tmp_db;tmp_db=tmp_db->next) {
                if (tmp_db->db_id == ctdb_db->db_id) {
-                       DEBUG(0,("db_id 0x%x hash collision. name1='%s' name2='%s'\n",
+                       DEBUG(DEBUG_CRIT,("db_id 0x%x hash collision. name1='%s' name2='%s'\n",
                                 tmp_db->db_id, db_name, tmp_db->db_name));
                        talloc_free(ctdb_db);
                        return -1;
                }
        }
 
-       if (ctdb->db_directory == NULL) {
-               ctdb->db_directory = VARDIR "/ctdb";
+       if (persistent) {
+               if (unhealthy_reason) {
+                       ret = ctdb_update_persistent_health(ctdb, ctdb_db,
+                                                           unhealthy_reason, 0);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ALERT,(__location__ " ctdb_update_persistent_health('%s','%s') failed: %d\n",
+                                                  ctdb_db->db_name, unhealthy_reason, ret));
+                               talloc_free(ctdb_db);
+                               return -1;
+                       }
+               }
+
+               if (ctdb->max_persistent_check_errors > 0) {
+                       remaining_tries = 1;
+               }
+               if (ctdb->done_startup) {
+                       remaining_tries = 0;
+               }
+
+               ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+                                  ctdb_db->db_name, ret));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
        }
 
-       /* make sure the db directory exists */
-       if (mkdir(ctdb->db_directory, 0700) == -1 && errno != EEXIST) {
-               DEBUG(0,(__location__ " Unable to create ctdb directory '%s'\n", 
-                        ctdb->db_directory));
+       if (ctdb_db->unhealthy_reason && remaining_tries == 0) {
+               DEBUG(DEBUG_ALERT,(__location__ "ERROR: tdb %s is marked as unhealthy: %s\n",
+                                  ctdb_db->db_name, ctdb_db->unhealthy_reason));
                talloc_free(ctdb_db);
                return -1;
        }
 
+       if (ctdb_db->unhealthy_reason) {
+               /* this is just a warning, but we want that in the log file! */
+               DEBUG(DEBUG_ALERT,(__location__ "Warning: tdb %s is marked as unhealthy: %s\n",
+                                  ctdb_db->db_name, ctdb_db->unhealthy_reason));
+       }
+
        /* open the database */
        ctdb_db->db_path = talloc_asprintf(ctdb_db, "%s/%s.%u", 
-                                          ctdb->db_directory, 
-                                          db_name, ctdb->vnn);
+                                          persistent?ctdb->db_directory_persistent:ctdb->db_directory, 
+                                          db_name, ctdb->pnn);
 
-       ctdb_db->ltdb = tdb_wrap_open(ctdb, ctdb_db->db_path, 0, 
-                                     TDB_CLEAR_IF_FIRST, O_CREAT|O_RDWR, 0666);
+       tdb_flags = persistent? TDB_DEFAULT : TDB_CLEAR_IF_FIRST | TDB_NOSYNC;
+       if (ctdb->valgrinding) {
+               tdb_flags |= TDB_NOMMAP;
+       }
+       tdb_flags |= TDB_DISALLOW_NESTING;
+       if (jenkinshash) {
+               tdb_flags |= TDB_INCOMPATIBLE_HASH;
+       }
+
+again:
+       ctdb_db->ltdb = tdb_wrap_open(ctdb, ctdb_db->db_path, 
+                                     ctdb->tunable.database_hash_size, 
+                                     tdb_flags, 
+                                     O_CREAT|O_RDWR, mode);
        if (ctdb_db->ltdb == NULL) {
-               DEBUG(0,("Failed to open tdb '%s'\n", ctdb_db->db_path));
-               talloc_free(ctdb_db);
-               return -1;
+               struct stat st;
+               int saved_errno = errno;
+
+               if (!persistent) {
+                       DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+                                         ctdb_db->db_path,
+                                         saved_errno,
+                                         strerror(saved_errno)));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+
+               if (remaining_tries == 0) {
+                       DEBUG(DEBUG_CRIT,(__location__
+                                         "Failed to open persistent tdb '%s': %d - %s\n",
+                                         ctdb_db->db_path,
+                                         saved_errno,
+                                         strerror(saved_errno)));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+
+               ret = stat(ctdb_db->db_path, &st);
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__
+                                         "Failed to open persistent tdb '%s': %d - %s\n",
+                                         ctdb_db->db_path,
+                                         saved_errno,
+                                         strerror(saved_errno)));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+
+               ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__
+                                         "Failed to open persistent tdb '%s': %d - %s\n",
+                                         ctdb_db->db_path,
+                                         saved_errno,
+                                         strerror(saved_errno)));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+
+               remaining_tries--;
+               mode = st.st_mode;
+               goto again;
        }
 
-       ctdb_check_db_empty(ctdb_db);
+       if (!persistent) {
+               ctdb_check_db_empty(ctdb_db);
+       } else {
+               ret = tdb_check(ctdb_db->ltdb->tdb, NULL, NULL);
+               if (ret != 0) {
+                       int fd;
+                       struct stat st;
+
+                       DEBUG(DEBUG_CRIT,("tdb_check(%s) failed: %d - %s\n",
+                                         ctdb_db->db_path, ret,
+                                         tdb_errorstr(ctdb_db->ltdb->tdb)));
+                       if (remaining_tries == 0) {
+                               talloc_free(ctdb_db);
+                               return -1;
+                       }
+
+                       fd = tdb_fd(ctdb_db->ltdb->tdb);
+                       ret = fstat(fd, &st);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_CRIT,(__location__
+                                                 "Failed to fstat() persistent tdb '%s': %d - %s\n",
+                                                 ctdb_db->db_path,
+                                                 errno,
+                                                 strerror(errno)));
+                               talloc_free(ctdb_db);
+                               return -1;
+                       }
+
+                       /* close the TDB */
+                       talloc_free(ctdb_db->ltdb);
+                       ctdb_db->ltdb = NULL;
+
+                       ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_CRIT,("Failed to backup corrupted tdb '%s'\n",
+                                                 ctdb_db->db_path));
+                               talloc_free(ctdb_db);
+                               return -1;
+                       }
+
+                       remaining_tries--;
+                       mode = st.st_mode;
+                       goto again;
+               }
+       }
 
        DLIST_ADD(ctdb->db_list, ctdb_db);
 
+       /* setting this can help some high churn databases */
+       tdb_set_max_dead(ctdb_db->ltdb->tdb, ctdb->tunable.database_max_dead);
+
        /* 
           all databases support the "null" function. we need this in
           order to do forced migration of records
        */
        ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_null_func, CTDB_NULL_FUNC);
        if (ret != 0) {
-               DEBUG(0,("Failed to setup null function for '%s'\n", ctdb_db->db_name));
+               DEBUG(DEBUG_CRIT,("Failed to setup null function for '%s'\n", ctdb_db->db_name));
                talloc_free(ctdb_db);
                return -1;
        }
@@ -275,36 +724,342 @@ int32_t ctdb_control_db_attach(struct ctdb_context *ctdb, TDB_DATA indata,
        */
        ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_func, CTDB_FETCH_FUNC);
        if (ret != 0) {
-               DEBUG(0,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name));
+               DEBUG(DEBUG_CRIT,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name));
+               talloc_free(ctdb_db);
+               return -1;
+       }
+
+       ret = ctdb_vacuum_init(ctdb_db);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,("Failed to setup vacuuming for "
+                                 "database '%s'\n", ctdb_db->db_name));
                talloc_free(ctdb_db);
                return -1;
        }
+
+
+       DEBUG(DEBUG_INFO,("Attached to database '%s'\n", ctdb_db->db_path));
        
+       /* success */
+       return 0;
+}
+
+
+/*
+  a client has asked to attach a new database
+
+  indata.dptr is used as the database name, tdb_flags are the flags
+  requested by the client (masked down to TDB_NOSYNC and
+  TDB_INCOMPATIBLE_HASH), persistent selects a persistent database and
+  client_id identifies the requesting local client (0 is exempt from
+  the recovery-mode check below).
+
+  On success outdata references the db_id of the attached database. If
+  the database was not attached yet it is attached locally and the
+  attach control is broadcast to all other nodes without waiting for a
+  reply.
+
+  returns 0 on success, -1 if the attach is refused or fails
+
+  NOTE(review): db_name is handed to string functions as is; presumably
+  the sender guarantees NUL termination of indata - confirm.
+ */
+int32_t ctdb_control_db_attach(struct ctdb_context *ctdb, TDB_DATA indata,
+                              TDB_DATA *outdata, uint64_t tdb_flags, 
+                              bool persistent, uint32_t client_id)
+{
+       const char *db_name = (const char *)indata.dptr;
+       struct ctdb_db_context *db;
+       struct ctdb_node *node;
+
+       /* don't allow any local clients to attach while we are in recovery mode
+        * except for the recovery daemon.
+        * allow all attach from the network since these are always from remote
+        * recovery daemons.
+        *
+        * NOTE(review): a non-zero client_id that ctdb_reqid_find() cannot
+        * resolve (client == NULL) is let through as well - confirm that
+        * this is intended.
+        */
+       if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE && client_id != 0) {
+               struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+
+               if (client != NULL && client->pid != ctdb->recoverd_pid) {
+                       DEBUG(DEBUG_ERR,("DB Attach to database %s refused for client with pid:%d since node is in recovery mode.\n", db_name, client->pid));
+                       return -1;
+               }
+       }
+
+       node = ctdb->nodes[ctdb->pnn];
+
+       /* the client can optionally pass additional tdb flags, but we
+          only allow a subset of those on the database in ctdb. Note
+          that tdb_flags is passed in via the (otherwise unused)
+          srvid to the attach control */
+       tdb_flags &= (TDB_NOSYNC|TDB_INCOMPATIBLE_HASH);
+
+       /* If the node is inactive it is not part of the cluster
+          and we should not allow clients to attach to any
+          databases
+       */
+       if (node->flags & NODE_FLAGS_INACTIVE) {
+               DEBUG(DEBUG_ERR,("DB Attach to database %s refused since node is inactive (disconnected or banned)\n", db_name));
+               return -1;
+       }
+
+
+       /* see if we already have this name; if so only add the requested flags */
+       db = ctdb_db_handle(ctdb, db_name);
+       if (db) {
+               outdata->dptr  = (uint8_t *)&db->db_id;
+               outdata->dsize = sizeof(db->db_id);
+               tdb_add_flags(db->ltdb->tdb, tdb_flags);
+               return 0;
+       }
+
+       if (ctdb_local_attach(ctdb, db_name, persistent, NULL, (tdb_flags&TDB_INCOMPATIBLE_HASH)?true:false) != 0) {
+               return -1;
+       }
+
+       db = ctdb_db_handle(ctdb, db_name); /* look up the context that ctdb_local_attach() just created */
+       if (!db) {
+               DEBUG(DEBUG_ERR,("Failed to find db handle for name '%s'\n", db_name));
+               return -1;
+       }
+
+       /* remember the flags the client has specified */
+       tdb_add_flags(db->ltdb->tdb, tdb_flags);
+
+       outdata->dptr  = (uint8_t *)&db->db_id;
+       outdata->dsize = sizeof(db->db_id);
+
+       /* Try to ensure it's locked in mem */
+       ctdb_lockdown_memory(ctdb);
+
        /* tell all the other nodes about this database */
-       ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
-                                CTDB_CONTROL_DB_ATTACH, 0, CTDB_CTRL_FLAG_NOREPLY,
+       ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, tdb_flags,
+                                persistent?CTDB_CONTROL_DB_ATTACH_PERSISTENT:
+                                               CTDB_CONTROL_DB_ATTACH,
+                                0, CTDB_CTRL_FLAG_NOREPLY,
                                 indata, NULL, NULL);
 
-       DEBUG(1,("Attached to database '%s'\n", ctdb_db->db_path));
-
        /* success */
        return 0;
 }
 
+
+/*
+  attach to all existing persistent databases
+
+  Scans ctdb->db_directory_persistent and attaches every file named
+  "<name>.tdb.<digits>" whose digits equal our own pnn; the database
+  name is the file name up to and including ".tdb". unhealthy_reason
+  is handed on to ctdb_local_attach() for each database.
+
+  returns 0 on success (also when the directory cannot be opened),
+  -1 on out of memory or if attaching one of the databases fails
+ */
+static int ctdb_attach_persistent(struct ctdb_context *ctdb,
+                                 const char *unhealthy_reason)
+{
+       DIR *d;
+       struct dirent *de;
+
+       /* open the persistent db directory and scan it for files */
+       d = opendir(ctdb->db_directory_persistent);
+       if (d == NULL) {
+               return 0;
+       }
+
+       while ((de=readdir(d))) {
+               char *p, *s, *q;
+               size_t len = strlen(de->d_name);
+               uint32_t node;
+               int invalid_name = 0;
+
+               s = talloc_strdup(ctdb, de->d_name);
+               if (s == NULL) {
+                       /*
+                        * release the directory handle first:
+                        * CTDB_NO_MEMORY() leaves this function
+                        * when s is NULL
+                        */
+                       closedir(d);
+                       CTDB_NO_MEMORY(ctdb, s);
+               }
+
+               /* only consider names that contain ".tdb." */
+               p = strstr(s, ".tdb.");
+               if (len < 7 || p == NULL) {
+                       talloc_free(s);
+                       continue;
+               }
+
+               /* only accept names ending with .tdb. and any number of digits */
+               q = p+5;
+               while (*q != 0 && invalid_name == 0) {
+                       /* <ctype.h> functions need an unsigned char value */
+                       if (!isdigit((unsigned char)*q++)) {
+                               invalid_name = 1;
+                       }
+               }
+               if (invalid_name == 1 || sscanf(p+5, "%u", &node) != 1 || node != ctdb->pnn) {
+                       DEBUG(DEBUG_ERR,("Ignoring persistent database '%s'\n", de->d_name));
+                       talloc_free(s);
+                       continue;
+               }
+               /* cut the name after ".tdb", dropping the ".<pnn>" suffix */
+               p[4] = 0;
+
+               if (ctdb_local_attach(ctdb, s, true, unhealthy_reason, false) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to attach to persistent database '%s'\n", de->d_name));
+                       closedir(d);
+                       talloc_free(s);
+                       return -1;
+               }
+
+               DEBUG(DEBUG_INFO,("Attached to persistent database %s\n", s));
+
+               talloc_free(s);
+       }
+       closedir(d);
+       return 0;
+}
+
+/*
+  prepare the database directories and attach to all persistent databases
+
+  - fills in default paths for db_directory, db_directory_persistent and
+    db_directory_state when they are unset, and creates the directories
+  - opens the per-node persistent health tdb
+    ("<db_directory_state>/" PERSISTENT_HEALTH_TDB ".<pnn>") and verifies
+    it with tdb_check()
+  - if the open or the check fails, retries exactly once after re-opening
+    the file with TDB_CLEAR_IF_FIRST; in that case an unhealthy_reason
+    string is built and handed to ctdb_attach_persistent()
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_attach_databases(struct ctdb_context *ctdb)
+{
+       int ret;
+       char *persistent_health_path = NULL;
+       char *unhealthy_reason = NULL;
+       bool first_try = true;
+
+       if (ctdb->db_directory == NULL) {
+               ctdb->db_directory = VARDIR "/ctdb";
+       }
+       if (ctdb->db_directory_persistent == NULL) {
+               ctdb->db_directory_persistent = VARDIR "/ctdb/persistent";
+       }
+       if (ctdb->db_directory_state == NULL) {
+               ctdb->db_directory_state = VARDIR "/ctdb/state";
+       }
+
+       /* make sure the db directory exists */
+       ret = mkdir(ctdb->db_directory, 0700);
+       if (ret == -1 && errno != EEXIST) {
+               DEBUG(DEBUG_CRIT,(__location__ " Unable to create ctdb directory '%s'\n",
+                        ctdb->db_directory));
+               return -1;
+       }
+
+       /* make sure the persistent db directory exists */
+       ret = mkdir(ctdb->db_directory_persistent, 0700);
+       if (ret == -1 && errno != EEXIST) {
+               DEBUG(DEBUG_CRIT,(__location__ " Unable to create ctdb persistent directory '%s'\n",
+                        ctdb->db_directory_persistent));
+               return -1;
+       }
+
+       /* make sure the internal state db directory exists */
+       ret = mkdir(ctdb->db_directory_state, 0700);
+       if (ret == -1 && errno != EEXIST) {
+               DEBUG(DEBUG_CRIT,(__location__ " Unable to create ctdb state directory '%s'\n",
+                        ctdb->db_directory_state));
+               return -1;
+       }
+
+       persistent_health_path = talloc_asprintf(ctdb, "%s/%s.%u",
+                                                ctdb->db_directory_state,
+                                                PERSISTENT_HEALTH_TDB,
+                                                ctdb->pnn);
+       if (persistent_health_path == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+               return -1;
+       }
+
+again:
+
+       ctdb->db_persistent_health = tdb_wrap_open(ctdb, persistent_health_path,
+                                                  0, TDB_DISALLOW_NESTING,
+                                                  O_CREAT | O_RDWR, 0600);
+       if (ctdb->db_persistent_health == NULL) {
+               struct tdb_wrap *tdb;
+
+               /* the second attempt failed as well: give up */
+               if (!first_try) {
+                       DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+                                         persistent_health_path,
+                                         errno,
+                                         strerror(errno)));
+                       talloc_free(persistent_health_path);
+                       talloc_free(unhealthy_reason);
+                       return -1;
+               }
+               first_try = false;
+
+               unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s",
+                                                  persistent_health_path,
+                                                  "was cleared after a failure",
+                                                  "manual verification needed");
+               if (unhealthy_reason == NULL) {
+                       DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+                       talloc_free(persistent_health_path);
+                       return -1;
+               }
+
+               DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - retrying after CLEAR_IF_FIRST\n",
+                                 persistent_health_path));
+               tdb = tdb_wrap_open(ctdb, persistent_health_path,
+                                   0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING,
+                                   O_CREAT | O_RDWR, 0600);
+               /* a NULL handle means the open failed (same convention as
+                  the check on db_persistent_health above); only then is
+                  the recovery attempt abandoned */
+               if (tdb == NULL) {
+                       DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n",
+                                         persistent_health_path,
+                                         errno,
+                                         strerror(errno)));
+                       talloc_free(persistent_health_path);
+                       talloc_free(unhealthy_reason);
+                       return -1;
+               }
+
+               /* drop the temporary handle and re-open without CLEAR_IF_FIRST */
+               talloc_free(tdb);
+               goto again;
+       }
+       ret = tdb_check(ctdb->db_persistent_health->tdb, NULL, NULL);
+       if (ret != 0) {
+               struct tdb_wrap *tdb;
+
+               talloc_free(ctdb->db_persistent_health);
+               ctdb->db_persistent_health = NULL;
+
+               /* the file is still bad after it was cleared: give up */
+               if (!first_try) {
+                       DEBUG(DEBUG_CRIT,("tdb_check('%s') failed\n",
+                                         persistent_health_path));
+                       talloc_free(persistent_health_path);
+                       talloc_free(unhealthy_reason);
+                       return -1;
+               }
+               first_try = false;
+
+               unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s",
+                                                  persistent_health_path,
+                                                  "was cleared after a failure",
+                                                  "manual verification needed");
+               if (unhealthy_reason == NULL) {
+                       DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+                       talloc_free(persistent_health_path);
+                       return -1;
+               }
+
+               DEBUG(DEBUG_CRIT,("tdb_check('%s') failed - retrying after CLEAR_IF_FIRST\n",
+                                 persistent_health_path));
+               tdb = tdb_wrap_open(ctdb, persistent_health_path,
+                                   0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING,
+                                   O_CREAT | O_RDWR, 0600);
+               /* NULL means the open failed; a valid handle means the
+                  file could be re-opened and the retry may proceed */
+               if (tdb == NULL) {
+                       DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n",
+                                         persistent_health_path,
+                                         errno,
+                                         strerror(errno)));
+                       talloc_free(persistent_health_path);
+                       talloc_free(unhealthy_reason);
+                       return -1;
+               }
+
+               /* drop the temporary handle and re-open without CLEAR_IF_FIRST */
+               talloc_free(tdb);
+               goto again;
+       }
+       talloc_free(persistent_health_path);
+
+       /* unhealthy_reason is NULL unless the health tdb had to be cleared */
+       ret = ctdb_attach_persistent(ctdb, unhealthy_reason);
+       talloc_free(unhealthy_reason);
+       if (ret != 0) {
+               return ret;
+       }
+
+       return 0;
+}
+
 /*
   called when a broadcast seqnum update comes in
  */
 int32_t ctdb_ltdb_update_seqnum(struct ctdb_context *ctdb, uint32_t db_id, uint32_t srcnode)
 {
        struct ctdb_db_context *ctdb_db;
-       if (srcnode == ctdb->vnn) {
+       if (srcnode == ctdb->pnn) {
                /* don't update ourselves! */
                return 0;
        }
 
        ctdb_db = find_ctdb_db(ctdb, db_id);
        if (!ctdb_db) {
-               DEBUG(0,("Unknown db_id 0x%x in ctdb_ltdb_update_seqnum\n", db_id));
+               DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_update_seqnum\n", db_id));
+               return -1;
+       }
+
+       if (ctdb_db->unhealthy_reason) {
+               DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_ltdb_update_seqnum: %s\n",
+                                ctdb_db->db_name, ctdb_db->unhealthy_reason));
                return -1;
        }
 
@@ -334,9 +1089,9 @@ static void ctdb_ltdb_seqnum_check(struct event_context *ev, struct timed_event
        ctdb_db->seqnum = new_seqnum;
 
        /* setup a new timer */
-       ctdb_db->te = 
+       ctdb_db->seqnum_update =
                event_add_timed(ctdb->ev, ctdb_db, 
-                               timeval_current_ofs(ctdb->tunable.seqnum_frequency, 0),
+                               timeval_current_ofs(ctdb->tunable.seqnum_interval/1000, (ctdb->tunable.seqnum_interval%1000)*1000),
                                ctdb_ltdb_seqnum_check, ctdb_db);
 }
 
@@ -348,14 +1103,14 @@ int32_t ctdb_ltdb_enable_seqnum(struct ctdb_context *ctdb, uint32_t db_id)
        struct ctdb_db_context *ctdb_db;
        ctdb_db = find_ctdb_db(ctdb, db_id);
        if (!ctdb_db) {
-               DEBUG(0,("Unknown db_id 0x%x in ctdb_ltdb_enable_seqnum\n", db_id));
+               DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_enable_seqnum\n", db_id));
                return -1;
        }
 
-       if (ctdb_db->te == NULL) {
-               ctdb_db->te = 
+       if (ctdb_db->seqnum_update == NULL) {
+               ctdb_db->seqnum_update =
                        event_add_timed(ctdb->ev, ctdb_db, 
-                                       timeval_current_ofs(ctdb->tunable.seqnum_frequency, 0),
+                                       timeval_current_ofs(ctdb->tunable.seqnum_interval/1000, (ctdb->tunable.seqnum_interval%1000)*1000),
                                        ctdb_ltdb_seqnum_check, ctdb_db);
        }
 
@@ -364,3 +1119,26 @@ int32_t ctdb_ltdb_enable_seqnum(struct ctdb_context *ctdb, uint32_t db_id)
        return 0;
 }
 
+/*
+  control handler: set the priority of an attached database
+
+  indata.dptr is interpreted as a struct ctdb_db_priority holding the
+  db_id to modify and the requested priority, which must lie in the
+  range 1..NUM_DB_PRIORITIES.
+  NOTE(review): indata.dsize is not validated here - presumably the
+  caller checks it before dispatching; confirm.
+
+  returns 0 on success, -1 when the db_id is unknown or the priority is
+  out of range
+ */
+int32_t ctdb_control_set_db_priority(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_db_priority *request;
+       struct ctdb_db_context *db;
+
+       request = (struct ctdb_db_priority *)indata.dptr;
+
+       /* the database must already be known to this node */
+       db = find_ctdb_db(ctdb, request->db_id);
+       if (db == NULL) {
+               DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_set_db_priority\n", request->db_id));
+               return -1;
+       }
+
+       /* reject anything outside 1..NUM_DB_PRIORITIES */
+       if (!(request->priority >= 1 && request->priority <= NUM_DB_PRIORITIES)) {
+               DEBUG(DEBUG_ERR,("Trying to set invalid priority : %u\n", request->priority));
+               return -1;
+       }
+
+       db->priority = request->priority;
+       DEBUG(DEBUG_INFO,("Setting DB priority to %u for db 0x%08x\n", request->priority, request->db_id));
+
+       return 0;
+}
+
+