From 79c25fe241cf5d8f92e23d3736823ebaf4e1769d Mon Sep 17 00:00:00 2001
From: Martin Schwenke <martin@meltin.net>
Date: Tue, 16 Nov 2010 19:31:18 +1100
Subject: [PATCH 1/1] 60.nfs only fails or warns after 10 consecutive
 nfsd/statd failures.

These failures are sometimes the result of slow restarts so we want to
avoid dirtying the logs or marking a node unhealthy because of them,
unless they are excessive.

For these 2 cases we use the existing fail counting code but hack a
temporary service_name in a subshell to allow separate fail counts.

We also update ctdb_check_rpc() so that it captures the error output
from rpcinfo and we add a message including the service name to the
beginning.  The error is printed to stdout but is also stored in
ctdb_check_rpc_out to allow it to be conditionally used by the caller.
This function also now returns non-zero rather than exiting on
failure.

Other direct rpcinfo calls are relaced by called to ctdb_check_rpc()
for consistency.

Option handling code for service restarts is cleaned up so that fits
in 80 columns.  A more informative restart messageis now used in all
cases, printing the exact command being used to start a service.

Signed-off-by: Martin Schwenke <martin@meltin.net>
---
 config/events.d/60.nfs | 74 ++++++++++++++++++++++++++----------------
 config/functions       | 12 ++++---
 2 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs
index 57c81d35..f424f8cb 100755
--- a/config/events.d/60.nfs
+++ b/config/events.d/60.nfs
@@ -51,24 +51,43 @@ case "$1" in
 
 	# check that statd responds to rpc requests
 	# if statd is not running we try to restart it
-	rpcinfo -u localhost 100024 1 > /dev/null || {
-		RPCSTATDOPTS=""
-		[ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n $STATD_HOSTNAME"
-		[ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p $STATD_PORT"
-		[ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o $STATD_OUTGOING_PORT"
-		rpc.statd $RPCSTATDOPTS 
-		echo "ERROR: STATD is not responding. Trying to restart it. [rpc.statd $RPCSTATDOPTS]"
-	}
+	if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
+		(service_name="nfs_statd"; ctdb_counter_init)
+	else
+		p="rpc.statd" ; cmd="$p"
+		cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+		cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
+		cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+		(
+			service_name="nfs_statd"
+			ctdb_counter_incr
+			ctdb_check_counter_limit 10 quiet >/dev/null
+		) || {
+			echo "$ctdb_check_rpc_out"
+			echo "Trying to restart STATD [$cmd]"
+		}
+		$cmd
+	fi
 
 
 	# check that NFS responds to rpc requests
 	[ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
-	    (ctdb_check_rpc "NFS" 100003 3)
-	    [ $? = "0" ] || {
-		echo "Trying to restart NFS service"
-		startstop_nfs restart
-		exit 1
-	    }
+	    if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
+		(service_name="nfs_knfsd"; ctdb_counter_init)
+	    else
+		(
+			service_name="nfs_knfsd"
+			ctdb_counter_incr
+			ctdb_check_counter_limit 10 quiet >/dev/null
+		) || {
+			echo "$ctdb_check_rpc_out"
+			echo "Trying to restart NFS service"
+			startstop_nfs restart
+			exit 1
+		}
+		# we haven't hit the failure limit so restart quietly
+		startstop_nfs restart >/dev/null 2>&1 &
+	    fi
 	}
 
 	# and that its directories are available
@@ -79,8 +98,7 @@ case "$1" in
 	} || exit $?
 
 	# check that lockd responds to rpc requests
-	(ctdb_check_rpc "lockd" 100021 1)
-	[ $? = "0" ] || {
+	ctdb_check_rpc "LOCKD" 100021 1 || {
 		echo "Trying to restart lock manager service"
 		startstop_nfs restart
 		startstop_nfslock restart
@@ -89,23 +107,23 @@ case "$1" in
 
 	# mount needs special handling since it is sometimes not started
 	# correctly on RHEL5
-	rpcinfo -u localhost 100005 1 > /dev/null || {
-		echo "ERROR: MOUNTD is not running. Trying to restart it."
-		RPCMOUNTDOPTS=""
-		[ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT"
-		killall -q -9 rpc.mountd
-		rpc.mountd $RPCMOUNTDOPTS &
+	ctdb_check_rpc "MOUNTD" 100005 1 || {
+		p="rpc.mountd"
+		cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+		echo "Trying to restart MOUNTD [${cmd}]"
+		killall -q -9 $p
+		$cmd &
 		exit 1
 	}
 	# rquotad needs special handling since it is sometimes not started
 	# correctly on RHEL5
 	# this is not a critical service so we dont flag the node as unhealthy
-	rpcinfo -u localhost 100011 1 > /dev/null || {
-		echo "ERROR: RQUOTAD is not running. Trying to restart it."
-		RPCRQUOTADOPTS=""
-		[ -n "$RQUOTAD_PORT" ] && RPCRQUOTADOPTS="$RPCRQUOTADOPTS -p $RQUOTAD_PORT"
-		killall -q -9 rpc.rquotad
-		rpc.rquotad $RPCRQUOTADOPTS &
+	ctdb_check_rpc "RQUOTAD" 100011 1 || {
+		p="rpc.rquotad"
+		cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+		echo "Trying to restart RQUOTAD [${cmd}]"
+		killall -q -9 $p
+		$cmd &
 	}
 
 	# once every 60 seconds, update the statd state database for which
diff --git a/config/functions b/config/functions
index 9659d48d..854060a6 100755
--- a/config/functions
+++ b/config/functions
@@ -144,10 +144,14 @@ ctdb_check_rpc() {
     progname="$1"
     prognum="$2"
     version="$3"
-    rpcinfo -u localhost $prognum $version > /dev/null || {
-	    echo "ERROR: $progname not responding to rpc requests"
-	    exit 1
-    }
+
+    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
+    if [ $? -ne 0 ] ; then
+	ctdb_check_rpc_out="ERROR: $progname failed RPC check:
+$ctdb_check_rpc_out"
+	echo "$ctdb_check_rpc_out"
+	return 1
+    fi
 }
 
 ######################################################
-- 
2.34.1