split node health monitoring from checking for connected/disconnected nodes

author Ronnie Sahlberg <sahlberg@ronnie>

Mon, 14 Jan 2008 21:42:12 +0000 (08:42 +1100)

committer Ronnie Sahlberg <sahlberg@ronnie>

Mon, 14 Jan 2008 21:42:12 +0000 (08:42 +1100)
author Ronnie Sahlberg <sahlberg@ronnie>
Mon, 14 Jan 2008 21:42:12 +0000 (08:42 +1100)
committer Ronnie Sahlberg <sahlberg@ronnie>
Mon, 14 Jan 2008 21:42:12 +0000 (08:42 +1100)
diff --git a/Makefile.in b/Makefile.in

index ba5eab69dde1f64febcb5a050d842d98ea6766fe..b28ab9cbabb93239501559a4ec4d52a67dc7931f 100644 (file)
--- a/Makefile.in
+++ b/Makefile.in
@@ -51,6 +51,7 @@ CTDB_SERVER_OBJ = server/ctdbd.o server/ctdb_daemon.o server/ctdb_lockwait.o \
         server/ctdb_control.o server/ctdb_call.o server/ctdb_ltdb_server.o \
         server/ctdb_traverse.o server/eventscript.o server/ctdb_takeover.o \
         server/ctdb_serverids.o server/ctdb_persistent.o \
+       server/ctdb_keepalive.o \
         $(CTDB_CLIENT_OBJ) $(CTDB_TCP_OBJ) @INFINIBAND_WRAPPER_OBJ@
  
  TEST_BINS=bin/ctdb_bench bin/ctdb_fetch bin/ctdb_store bin/ctdb_randrec bin/ctdb_persistent bin/rb_test \
diff --git a/include/ctdb_private.h b/include/ctdb_private.h

index 62764a1fbc90a3491f01bfa756ee05688fe04c80..8780275769bab5f5a158a3558aeee7cdabd7cbee 100644 (file)
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -324,6 +324,7 @@ struct ctdb_context {
         struct event_context *ev;
         uint32_t recovery_mode;
         TALLOC_CTX *tickle_update_context;
+       TALLOC_CTX *keepalive_ctx;
         struct ctdb_tunable tunable;
         enum ctdb_freeze_mode freeze_mode;
         struct ctdb_freeze_handle *freeze_handle;
@@ -1059,6 +1060,8 @@ void ctdb_stop_monitoring(struct ctdb_context *ctdb);
  void ctdb_start_monitoring(struct ctdb_context *ctdb);
  void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb);
  void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode);
+void ctdb_start_keepalive(struct ctdb_context *ctdb);
+void ctdb_stop_keepalive(struct ctdb_context *ctdb);
  
  void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node);
  void ctdb_call_resend_all(struct ctdb_context *ctdb);
diff --git a/server/ctdb_control.c b/server/ctdb_control.c

index 35266bcde23931c3c08b8797042a7a63500921ff..c580f747391aaa7535ef0bf960ac210193309abc 100644 (file)
--- a/server/ctdb_control.c
+++ b/server/ctdb_control.c
@@ -228,6 +228,7 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
  
         case CTDB_CONTROL_SHUTDOWN:
                 ctdb_stop_recoverd(ctdb);
+               ctdb_stop_keepalive(ctdb);
                 ctdb_stop_monitoring(ctdb);
                 ctdb_release_all_ips(ctdb);
                 ctdb->methods->shutdown(ctdb);
diff --git a/server/ctdb_daemon.c b/server/ctdb_daemon.c

index 0362bb420f59ae3f7e3e896becccffba7ce2c4bc..a1d9de6eeb9763d63f446b36fcac56373b64ae7c 100644 (file)
--- a/server/ctdb_daemon.c
+++ b/server/ctdb_daemon.c
@@ -89,7 +89,10 @@ static void ctdb_start_transport(struct ctdb_context *ctdb)
         ctdb_register_message_handler(ctdb, ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, 
                                       flag_change_handler, NULL);
  
-       /* start monitoring for dead nodes */
+       /* start monitoring for connected/disconnected nodes */
+       ctdb_start_keepalive(ctdb);
+
+       /* start monitoring for node health */
         ctdb_start_monitoring(ctdb);
  
         /* start periodic update of tcp tickle lists */
diff --git a/server/ctdb_keepalive.c b/server/ctdb_keepalive.c

new file mode 100644 (file)

index 0000000..216127c
--- /dev/null
+++ b/server/ctdb_keepalive.c
@@ -0,0 +1,105 @@
+/* 
+   monitoring links to all other nodes to detect dead nodes
+
+
+   Copyright (C) Ronnie Sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/events/events.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+
+
+/* timer callback: see if any nodes are dead or have come back.
+   sends a keepalive on links where tx_cnt is still zero and then
+   re-arms itself after tunable.keepalive_interval */
+static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_event *te, 
+                                     struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       int i;
+
+       /* walk all nodes; our own pnn is skipped */
+       for (i=0;i<ctdb->num_nodes;i++) {
+               struct ctdb_node *node = ctdb->nodes[i];
+               if (node->pnn == ctdb->pnn) {
+                       continue;
+               }
+               
+               if (node->flags & NODE_FLAGS_DISCONNECTED) {
+                       /* it might have come alive again */
+                       if (node->rx_cnt != 0) {
+                               ctdb_node_connected(node);
+                       }
+                       continue;
+               }
+
+               /* count consecutive checks where rx_cnt was still zero */
+               if (node->rx_cnt == 0) {
+                       node->dead_count++;
+               } else {
+                       node->dead_count = 0;
+               }
+
+               node->rx_cnt = 0;
+
+               if (node->dead_count >= ctdb->tunable.keepalive_limit) {
+                       DEBUG(0,("dead count reached for node %u\n", node->pnn));
+                       ctdb_node_dead(node);
+                       ctdb_send_keepalive(ctdb, node->pnn);
+                       /* maybe tell the transport layer to kill the
+                          sockets as well?
+                       */
+                       continue;
+               }
+               /* tx_cnt still zero since the last check: send a keepalive */
+               if (node->tx_cnt == 0) {
+                       DEBUG(5,("sending keepalive to %u\n", node->pnn));
+                       ctdb_send_keepalive(ctdb, node->pnn);
+               }
+
+               node->tx_cnt = 0;
+       }
+       /* re-arm the timer; NOTE(review): the return value is not checked */
+       event_add_timed(ctdb->ev, ctdb->keepalive_ctx,
+                       timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), 
+                       ctdb_check_for_dead_nodes, ctdb);
+}
+
+/* create keepalive_ctx and schedule the first ctdb_check_for_dead_nodes run; allocation failures are fatal */
+void ctdb_start_keepalive(struct ctdb_context *ctdb)
+{
+       struct timed_event *te;
+
+       ctdb->keepalive_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY_FATAL(ctdb, ctdb->keepalive_ctx);
+
+       te = event_add_timed(ctdb->ev, ctdb->keepalive_ctx,
+                            timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), 
+                            ctdb_check_for_dead_nodes, ctdb);
+       CTDB_NO_MEMORY_FATAL(ctdb, te);
+
+       DEBUG(0,("Keepalive monitoring has been started\n"));
+}
+/* free keepalive_ctx (the timed events were added on it, so this presumably cancels them - TODO confirm) and clear the pointer */
+void ctdb_stop_keepalive(struct ctdb_context *ctdb)
+{
+       talloc_free(ctdb->keepalive_ctx);
+       ctdb->keepalive_ctx = NULL;
+}
+
diff --git a/server/ctdb_monitor.c b/server/ctdb_monitor.c

index 9120324f8493e492a954e4546929624eebf4a7f4..f798b92d6dbd4d6add682931022a0eb7d8e445cf 100644 (file)
--- a/server/ctdb_monitor.c
+++ b/server/ctdb_monitor.c
@@ -30,62 +30,6 @@ struct ctdb_monitor_state {
         uint32_t next_interval;
  };
  
-/*
-  see if any nodes are dead
- */
-static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_event *te, 
-                                     struct timeval t, void *private_data)
-{
-       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
-       int i;
-
-       /* send a keepalive to all other nodes, unless */
-       for (i=0;i<ctdb->num_nodes;i++) {
-               struct ctdb_node *node = ctdb->nodes[i];
-               if (node->pnn == ctdb->pnn) {
-                       continue;
-               }
-               
-               if (node->flags & NODE_FLAGS_DISCONNECTED) {
-                       /* it might have come alive again */
-                       if (node->rx_cnt != 0) {
-                               ctdb_node_connected(node);
-                       }
-                       continue;
-               }
-
-
-               if (node->rx_cnt == 0) {
-                       node->dead_count++;
-               } else {
-                       node->dead_count = 0;
-               }
-
-               node->rx_cnt = 0;
-
-               if (node->dead_count >= ctdb->tunable.keepalive_limit) {
-                       DEBUG(0,("dead count reached for node %u\n", node->pnn));
-                       ctdb_node_dead(node);
-                       ctdb_send_keepalive(ctdb, node->pnn);
-                       /* maybe tell the transport layer to kill the
-                          sockets as well?
-                       */
-                       continue;
-               }
-               
-               if (node->tx_cnt == 0) {
-                       DEBUG(5,("sending keepalive to %u\n", node->pnn));
-                       ctdb_send_keepalive(ctdb, node->pnn);
-               }
-
-               node->tx_cnt = 0;
-       }
-       
-       event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, 
-                       timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), 
-                       ctdb_check_for_dead_nodes, ctdb);
-}
-
  static void ctdb_check_health(struct event_context *ev, struct timed_event *te, 
                               struct timeval t, void *private_data);
  
@@ -248,11 +192,6 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb)
         ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
         CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
  
-       te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
-                            timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), 
-                            ctdb_check_for_dead_nodes, ctdb);
-       CTDB_NO_MEMORY_FATAL(ctdb, te);
-
         te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
                              timeval_current_ofs(1, 0), 
                              ctdb_check_health, ctdb);
author	Ronnie Sahlberg <sahlberg@ronnie>
	Mon, 14 Jan 2008 21:42:12 +0000 (08:42 +1100)
committer	Ronnie Sahlberg <sahlberg@ronnie>
	Mon, 14 Jan 2008 21:42:12 +0000 (08:42 +1100)
Makefile.in		patch \| blob \| history
include/ctdb_private.h		patch \| blob \| history
server/ctdb_control.c		patch \| blob \| history
server/ctdb_daemon.c		patch \| blob \| history
server/ctdb_keepalive.c	[new file with mode: 0644]	patch \| blob
server/ctdb_monitor.c		patch \| blob \| history