first step in health monitoring of cluster nodes. When not healthy they will be marke...
author: Andrew Tridgell <tridge@samba.org>
Tue, 5 Jun 2007 07:43:19 +0000 (17:43 +1000)
committer: Andrew Tridgell <tridge@samba.org>
Tue, 5 Jun 2007 07:43:19 +0000 (17:43 +1000)
common/ctdb.c
common/ctdb_client.c
common/ctdb_recoverd.c
common/ctdb_traverse.c
common/ctdb_tunables.c
config/events.d/50.samba
include/ctdb_private.h
takeover/ctdb_takeover.c
tools/ctdb_control.c

index 3fc2d7d53da8247673862cfeada29ea299ff9ba9..354b0f64ceefb6cefc262f08cccbe6cfc85fd657 100644 (file)
@@ -222,14 +222,16 @@ uint32_t ctdb_get_vnn(struct ctdb_context *ctdb)
 }
 
 /*
-  return the number of connected nodes
+  return the number of enabled nodes
 */
-uint32_t ctdb_get_num_connected_nodes(struct ctdb_context *ctdb)
+uint32_t ctdb_get_num_enabled_nodes(struct ctdb_context *ctdb)
 {
        int i;
        uint32_t count=0;
        for (i=0;i<ctdb->vnn_map->size;i++) {
-               if (ctdb->nodes[ctdb->vnn_map->map[i]]->flags & NODE_FLAGS_CONNECTED) {
+               struct ctdb_node *node = ctdb->nodes[ctdb->vnn_map->map[i]];
+               if ((node->flags & NODE_FLAGS_CONNECTED) &&
+                   !(node->flags & NODE_FLAGS_DISABLED)) {
                        count++;
                }
        }
index eed404341486eae455dc7140d564031c4dc43691..5662c61720edd975c988bf774814ddc85c42e64b 100644 (file)
@@ -1364,7 +1364,7 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name)
        ctdb_db->db_id = *(uint32_t *)data.dptr;
        talloc_free(data.dptr);
 
-       ret = ctdb_ctrl_getdbpath(ctdb, timeval_current_ofs(1, 0), CTDB_CURRENT_NODE, ctdb_db->db_id, ctdb_db, &ctdb_db->db_path);
+       ret = ctdb_ctrl_getdbpath(ctdb, timeval_current_ofs(2, 0), CTDB_CURRENT_NODE, ctdb_db->db_id, ctdb_db, &ctdb_db->db_path);
        if (ret != 0) {
                DEBUG(0,("Failed to get dbpath for database '%s'\n", name));
                talloc_free(ctdb_db);
index a2d845de63d6a4fc389876f1d6518c1454885efc..36ecb9f2390bc76df2f33f5c4c48fea2038b5be1 100644 (file)
@@ -697,6 +697,8 @@ again:
                              "MonitorFrequency", &ctdb->tunable.monitor_frequency);
        ctdb_ctrl_get_tunable(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 
                              "ElectionTimeout", &ctdb->tunable.election_timeout);
+       ctdb_ctrl_get_tunable(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 
+                             "TakeoverTimeout", &ctdb->tunable.takeover_timeout);
 
        vnn = ctdb_ctrl_getvnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
        if (vnn == (uint32_t)-1) {
index bceff7f0243643c574b6b8bca794aaf77cd3f22d..721448631efc9beff489b5abc94fdfece589fcd6 100644 (file)
@@ -372,7 +372,7 @@ int32_t ctdb_control_traverse_data(struct ctdb_context *ctdb, TDB_DATA data, TDB
 
        if (key.dsize == 0 && data.dsize == 0) {
                state->null_count++;
-               if (state->null_count != ctdb_get_num_connected_nodes(ctdb)) {
+               if (state->null_count != ctdb_get_num_enabled_nodes(ctdb)) {
                        return 0;
                }
        }
index 85794432b300c5e43963ea696873dc6dfaf8091a..7b8774c8de9edf42469ddfb86684f60b9fc54394 100644 (file)
@@ -35,6 +35,7 @@ static const struct {
        { "RecoverTimeout",    5,  offsetof(struct ctdb_tunable, recover_timeout) },
        { "MonitorFrequency",  1,  offsetof(struct ctdb_tunable, monitor_frequency) },
        { "ElectionTimeout",   3,  offsetof(struct ctdb_tunable, election_timeout) },
+       { "TakeoverTimeout",   5,  offsetof(struct ctdb_tunable, takeover_timeout) },
 };
 
 /*
index 0deea3f182ae7ccbccca60eecf274b199f15170b..affd964c7defcb7056c45bcf6f55f3457fd760e8 100755 (executable)
@@ -21,8 +21,10 @@ case $cmd in
        service smb stop > /dev/null 2>&1
        service winbind stop > /dev/null 2>&1
 
-       # start Samba service
-       service smb start
+       # start Samba service. Start it reniced, as under very heavy load 
+       # the number of smbd processes will mean that it leaves few cycles for
+       # anything else
+       nice service smb start
        service winbind start
 
        # wait for the Samba tcp ports to become available
index 7fe6af6bbf30082e5ee4e211ee7cfc0c3219798d..25fb1c685e94cedcc10647c0c8b8dd4393601452 100644 (file)
@@ -50,6 +50,7 @@ struct ctdb_tunable {
        uint32_t recover_timeout;
        uint32_t monitor_frequency;
        uint32_t election_timeout;
+       uint32_t takeover_timeout;
 };
 
 /*
@@ -109,6 +110,7 @@ struct ctdb_node {
        void *private_data; /* private to transport */
        uint32_t vnn;
 #define NODE_FLAGS_CONNECTED 0x00000001
+#define NODE_FLAGS_DISABLED  0x00000002
        uint32_t flags;
 
        /* used by the dead node monitoring */
@@ -905,7 +907,7 @@ int32_t ctdb_control_thaw(struct ctdb_context *ctdb);
 
 int ctdb_start_recoverd(struct ctdb_context *ctdb);
 
-uint32_t ctdb_get_num_connected_nodes(struct ctdb_context *ctdb);
+uint32_t ctdb_get_num_enabled_nodes(struct ctdb_context *ctdb);
 
 int ctdb_start_monitoring(struct ctdb_context *ctdb);
 void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode);
index 5e0b02513a0eab9de5b0e305a27f125cae1d4808..6f6b46f03d209ff6a40c24c205559cd782296703 100644 (file)
@@ -27,7 +27,7 @@
 #include "../include/ctdb_private.h"
 
 
-#define TAKEOVER_TIMEOUT() timeval_current_ofs(5,0)
+#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
 
 #define CTDB_ARP_INTERVAL 1
 #define CTDB_ARP_REPEAT   3
@@ -403,7 +403,8 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 
        /* work out which node will look after each public IP */
        for (i=0;i<nodemap->num;i++) {
-               if (nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED) {
+               if ((nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED) && 
+                   !(nodemap->nodes[i].flags & NODE_FLAGS_DISABLED)) {
                        ctdb->nodes[i]->takeover_vnn = nodemap->nodes[i].vnn;
                } else {
                        /* assign this dead nodes IP to the next higher node */
@@ -411,6 +412,7 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
                             j != i;
                             j=(j+1)%nodemap->num) {
                                if ((nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED) &&
+                                   !(nodemap->nodes[j].flags & NODE_FLAGS_DISABLED) &&
                                    ctdb_same_subnet(ctdb->nodes[j]->public_address, 
                                                     ctdb->nodes[i]->public_address, 
                                                     ctdb->nodes[j]->public_netmask_bits)) {
index 800f707927c478b96deb434dc2d80b0b5ceaf3f9..9d15ff9f657b803570e8e2d435a7db7b3695c955 100644 (file)
@@ -383,7 +383,7 @@ static int control_shutdown(struct ctdb_context *ctdb, int argc, const char **ar
 {
        int ret;
 
-       ret = ctdb_ctrl_shutdown(ctdb, timeval_current_ofs(1, 0), options.vnn);
+       ret = ctdb_ctrl_shutdown(ctdb, TIMELIMIT(), options.vnn);
        if (ret != 0) {
                printf("Unable to shutdown node %u\n", options.vnn);
                return ret;