From 79c25fe241cf5d8f92e23d3736823ebaf4e1769d Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Tue, 16 Nov 2010 19:31:18 +1100 Subject: [PATCH 1/1] 60.nfs only fails or warns after 10 consecutive nfsd/statd failures. These failures are sometimes the result of slow restarts so we want to avoid dirtying the logs or marking a node unhealthy because of them, unless they are excessive. For these 2 cases we use the existing fail counting code but hack a temporary service_name in a subshell to allow separate fail counts. We also update ctdb_check_rpc() so that it captures the error output from rpcinfo and we add a message including the service name to the beginning. The error is printed to stdout but is also stored in ctdb_check_rpc_out to allow it to be conditionally used by the caller. This function also now returns non-zero rather than exiting on failure. Other direct rpcinfo calls are replaced by calls to ctdb_check_rpc() for consistency. Option handling code for service restarts is cleaned up so that it fits in 80 columns. A more informative restart message is now used in all cases, printing the exact command being used to start a service. Signed-off-by: Martin Schwenke --- config/events.d/60.nfs | 74 ++++++++++++++++++++++++++---------------- config/functions | 12 ++++--- 2 files changed, 54 insertions(+), 32 deletions(-) diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs index 57c81d35..f424f8cb 100755 --- a/config/events.d/60.nfs +++ b/config/events.d/60.nfs @@ -51,24 +51,43 @@ case "$1" in # check that statd responds to rpc requests # if statd is not running we try to restart it - rpcinfo -u localhost 100024 1 > /dev/null || { - RPCSTATDOPTS="" - [ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n $STATD_HOSTNAME" - [ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p $STATD_PORT" - [ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o $STATD_OUTGOING_PORT" - rpc.statd $RPCSTATDOPTS - echo "ERROR: STATD is not responding. 
Trying to restart it. [rpc.statd $RPCSTATDOPTS]" - } + if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then + (service_name="nfs_statd"; ctdb_counter_init) + else + p="rpc.statd" ; cmd="$p" + cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}" + cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}" + cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}" + ( + service_name="nfs_statd" + ctdb_counter_incr + ctdb_check_counter_limit 10 quiet >/dev/null + ) || { + echo "$ctdb_check_rpc_out" + echo "Trying to restart STATD [$cmd]" + } + $cmd + fi # check that NFS responds to rpc requests [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || { - (ctdb_check_rpc "NFS" 100003 3) - [ $? = "0" ] || { - echo "Trying to restart NFS service" - startstop_nfs restart - exit 1 - } + if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then + (service_name="nfs_knfsd"; ctdb_counter_init) + else + ( + service_name="nfs_knfsd" + ctdb_counter_incr + ctdb_check_counter_limit 10 quiet >/dev/null + ) || { + echo "$ctdb_check_rpc_out" + echo "Trying to restart NFS service" + startstop_nfs restart + exit 1 + } + # we haven't hit the failure limit so restart quietly + startstop_nfs restart >/dev/null 2>&1 & + fi } # and that its directories are available @@ -79,8 +98,7 @@ case "$1" in } || exit $? # check that lockd responds to rpc requests - (ctdb_check_rpc "lockd" 100021 1) - [ $? = "0" ] || { + ctdb_check_rpc "LOCKD" 100021 1 || { echo "Trying to restart lock manager service" startstop_nfs restart startstop_nfslock restart @@ -89,23 +107,23 @@ case "$1" in # mount needs special handling since it is sometimes not started # correctly on RHEL5 - rpcinfo -u localhost 100005 1 > /dev/null || { - echo "ERROR: MOUNTD is not running. Trying to restart it." 
- RPCMOUNTDOPTS="" - [ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT" - killall -q -9 rpc.mountd - rpc.mountd $RPCMOUNTDOPTS & + ctdb_check_rpc "MOUNTD" 100005 1 || { + p="rpc.mountd" + cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}" + echo "Trying to restart MOUNTD [${cmd}]" + killall -q -9 $p + $cmd & exit 1 } # rquotad needs special handling since it is sometimes not started # correctly on RHEL5 # this is not a critical service so we dont flag the node as unhealthy - rpcinfo -u localhost 100011 1 > /dev/null || { - echo "ERROR: RQUOTAD is not running. Trying to restart it." - RPCRQUOTADOPTS="" - [ -n "$RQUOTAD_PORT" ] && RPCRQUOTADOPTS="$RPCRQUOTADOPTS -p $RQUOTAD_PORT" - killall -q -9 rpc.rquotad - rpc.rquotad $RPCRQUOTADOPTS & + ctdb_check_rpc "RQUOTAD" 100011 1 || { + p="rpc.rquotad" + cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}" + echo "Trying to restart RQUOTAD [${cmd}]" + killall -q -9 $p + $cmd & } # once every 60 seconds, update the statd state database for which diff --git a/config/functions b/config/functions index 9659d48d..854060a6 100755 --- a/config/functions +++ b/config/functions @@ -144,10 +144,14 @@ ctdb_check_rpc() { progname="$1" prognum="$2" version="$3" - rpcinfo -u localhost $prognum $version > /dev/null || { - echo "ERROR: $progname not responding to rpc requests" - exit 1 - } + + ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1) + if [ $? -ne 0 ] ; then + ctdb_check_rpc_out="ERROR: $progname failed RPC check: +$ctdb_check_rpc_out" + echo "$ctdb_check_rpc_out" + return 1 + fi } ###################################################### -- 2.34.1