60.nfs only fails or warns after 10 consecutive nfsd/statd failures.

author Martin Schwenke <martin@meltin.net>

Tue, 16 Nov 2010 08:31:18 +0000 (19:31 +1100)

committer Ronnie Sahlberg <ronniesahlberg@gmail.com>

Wed, 17 Nov 2010 00:43:20 +0000 (11:43 +1100)
author Martin Schwenke <martin@meltin.net>
Tue, 16 Nov 2010 08:31:18 +0000 (19:31 +1100)
committer Ronnie Sahlberg <ronniesahlberg@gmail.com>
Wed, 17 Nov 2010 00:43:20 +0000 (11:43 +1100)
diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs

index 57c81d3520e4f913e7039ba793e587d364357013..f424f8cb2951759270a3bd59c98af2166e5d8ae5 100755 (executable)
--- a/config/events.d/60.nfs
+++ b/config/events.d/60.nfs
@@ -51,24 +51,43 @@ case "$1" in
  
         # check that statd responds to rpc requests
         # if statd is not running we try to restart it
-       rpcinfo -u localhost 100024 1 > /dev/null || {
-               RPCSTATDOPTS=""
-               [ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n $STATD_HOSTNAME"
-               [ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p $STATD_PORT"
-               [ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o $STATD_OUTGOING_PORT"
-               rpc.statd $RPCSTATDOPTS 
-               echo "ERROR: STATD is not responding. Trying to restart it. [rpc.statd $RPCSTATDOPTS]"
-       }
+       if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
+               (service_name="nfs_statd"; ctdb_counter_init)
+       else
+               p="rpc.statd" ; cmd="$p"
+               cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+               cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
+               cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+               (
+                       service_name="nfs_statd"
+                       ctdb_counter_incr
+                       ctdb_check_counter_limit 10 quiet >/dev/null
+               ) || {
+                       echo "$ctdb_check_rpc_out"
+                       echo "Trying to restart STATD [$cmd]"
+               }
+               $cmd
+       fi
  
  
         # check that NFS responds to rpc requests
         [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
-           (ctdb_check_rpc "NFS" 100003 3)
-           [ $? = "0" ] || {
-               echo "Trying to restart NFS service"
-               startstop_nfs restart
-               exit 1
-           }
+           if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
+               (service_name="nfs_knfsd"; ctdb_counter_init)
+           else
+               (
+                       service_name="nfs_knfsd"
+                       ctdb_counter_incr
+                       ctdb_check_counter_limit 10 quiet >/dev/null
+               ) || {
+                       echo "$ctdb_check_rpc_out"
+                       echo "Trying to restart NFS service"
+                       startstop_nfs restart
+                       exit 1
+               }
+               # we haven't hit the failure limit so restart quietly
+               startstop_nfs restart >/dev/null 2>&1 &
+           fi
         }
  
         # and that its directories are available
@@ -79,8 +98,7 @@ case "$1" in
         } || exit $?
  
         # check that lockd responds to rpc requests
-       (ctdb_check_rpc "lockd" 100021 1)
-       [ $? = "0" ] || {
+       ctdb_check_rpc "LOCKD" 100021 1 || {
                 echo "Trying to restart lock manager service"
                 startstop_nfs restart
                 startstop_nfslock restart
@@ -89,23 +107,23 @@ case "$1" in
  
         # mount needs special handling since it is sometimes not started
         # correctly on RHEL5
-       rpcinfo -u localhost 100005 1 > /dev/null || {
-               echo "ERROR: MOUNTD is not running. Trying to restart it."
-               RPCMOUNTDOPTS=""
-               [ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT"
-               killall -q -9 rpc.mountd
-               rpc.mountd $RPCMOUNTDOPTS &
+       ctdb_check_rpc "MOUNTD" 100005 1 || {
+               p="rpc.mountd"
+               cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+               echo "Trying to restart MOUNTD [${cmd}]"
+               killall -q -9 $p
+               $cmd &
                 exit 1
         }
         # rquotad needs special handling since it is sometimes not started
         # correctly on RHEL5
         # this is not a critical service so we dont flag the node as unhealthy
-       rpcinfo -u localhost 100011 1 > /dev/null || {
-               echo "ERROR: RQUOTAD is not running. Trying to restart it."
-               RPCRQUOTADOPTS=""
-               [ -n "$RQUOTAD_PORT" ] && RPCRQUOTADOPTS="$RPCRQUOTADOPTS -p $RQUOTAD_PORT"
-               killall -q -9 rpc.rquotad
-               rpc.rquotad $RPCRQUOTADOPTS &
+       ctdb_check_rpc "RQUOTAD" 100011 1 || {
+               p="rpc.rquotad"
+               cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+               echo "Trying to restart RQUOTAD [${cmd}]"
+               killall -q -9 $p
+               $cmd &
         }
  
         # once every 60 seconds, update the statd state database for which
diff --git a/config/functions b/config/functions

index 9659d48dbe316f0d552d79fa8f3dac17673c5dc6..854060a66492adf6b2720159b87a6576e2d0e546 100755 (executable)
--- a/config/functions
+++ b/config/functions
@@ -144,10 +144,14 @@ ctdb_check_rpc() {
      progname="$1"
      prognum="$2"
      version="$3"
-    rpcinfo -u localhost $prognum $version > /dev/null || {
-           echo "ERROR: $progname not responding to rpc requests"
-           exit 1
-    }
+
+    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
+    if [ $? -ne 0 ] ; then
+       ctdb_check_rpc_out="ERROR: $progname failed RPC check:
+$ctdb_check_rpc_out"
+       echo "$ctdb_check_rpc_out"
+       return 1
+    fi
  }
  
  ######################################################
author	Martin Schwenke <martin@meltin.net>
	Tue, 16 Nov 2010 08:31:18 +0000 (19:31 +1100)
committer	Ronnie Sahlberg <ronniesahlberg@gmail.com>
	Wed, 17 Nov 2010 00:43:20 +0000 (11:43 +1100)
config/events.d/60.nfs		patch | blob | history
config/functions		patch | blob | history