Eventscripts: lower the fail/restart limits for nfsd.

[sahlberg/ctdb.git] / config / events.d / 60.nfs
diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs

index 8c9e7ebd780ed60502c1ead557fd908ebdb2e14f..79a071bab913f6ac71c74fdd42062b7c5c3bd0d2 100755 (executable)
--- a/config/events.d/60.nfs
+++ b/config/events.d/60.nfs
@@ -1,119 +1,206 @@
  #!/bin/sh
  # script to manage nfs in a clustered environment
  
-. /etc/ctdb/functions
-loadconfig nfs
+start_nfs() {
+       /bin/mkdir -p $CTDB_VARDIR/state/nfs
+       /bin/mkdir -p $CTDB_VARDIR/state/statd/ip
+       startstop_nfs stop
+       startstop_nfs start
+       echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle
+}
  
-[ "$CTDB_MANAGES_NFS" = "yes" ] || exit 0
-[ -z "$STATD_SHARED_DIRECTORY" ] && exit 0
+. $CTDB_BASE/functions
  
-cmd="$1"
-shift
+service_name="nfs"
+service_start="start_nfs"
+service_stop="startstop_nfs stop"
+service_reconfigure="startstop_nfs restart"
  
-PATH=/usr/bin:/bin:/usr/sbin:/sbin:$PATH
-
-case $cmd in 
-     startup)
-       /bin/mkdir -p /etc/ctdb/state/nfs
-       /bin/mkdir -p /etc/ctdb/state/statd/ip
-       /bin/mkdir -p $STATD_SHARED_DIRECTORY
-       ctdb_wait_directories "nfslock" "$STATD_SHARED_DIRECTORY"
+loadconfig
  
+[ "$NFS_SERVER_MODE" != "GANESHA" ] || exit 0
  
-       # wait for all nfs exported directories to become available
-       nfs_dirs=`grep -v '^#' < /etc/exports | cut -d' ' -f1`
-       ctdb_wait_directories "NFS" $nfs_dirs
+ctdb_start_stop_service
  
-       # make sure nfs is stopped before we start it, or it may get a bind error
-       service nfs stop > /dev/null 2>&1
-       service nfslock stop > /dev/null 2>&1
+is_ctdb_managed_service || exit 0
  
-       service nfslock start
-       service nfs start
+case "$1" in 
+     init)
+       # read statd from persistent database
+       ;;
+     startup)
+       ctdb_service_start
+       mkdir -p $CTDB_VARDIR/state/statd
+       touch $CTDB_VARDIR/state/statd/update-trigger
         ;;
  
       shutdown)
-       service nfs stop
-       service nfslock stop
-       exit 0
+       ctdb_service_stop
         ;;
  
       takeip)
-       ip=$2
-
-       echo $ip >> /etc/ctdb/state/statd/restart
-
-       # having a list of what IPs we have allows statd to do the right 
-       # thing via /etc/ctdb/statd-callout
-       /bin/touch /etc/ctdb/state/statd/ip/$ip
-       exit 0
+       ctdb_service_set_reconfigure
         ;;
  
       releaseip)
-       iface=$1
-       ip=$2
-       maskbits=$3
-
-       echo $ip >> /etc/ctdb/state/statd/restart
-
-       /bin/rm -f /etc/ctdb/state/statd/ip/$ip
-
-       # RST all tcp connections to the lockmanager
-       [ ! -z "$LOCKD_TCPPORT" ] && {
-               # RST all tcp connections used for NLM to ensure that they do
-               # not survive in ESTABLISHED state across a failover/failback
-               # and create an ack storm
-               netstat -tn |egrep "^tcp.*\s+$ip:${LOCKD_TCPPORT}\s+.*ESTABLISHED" | awk '{print $4" "$5}' | while read dest src; do
-                       srcip=`echo $src | cut -d: -f1`
-                       srcport=`echo $src | cut -d: -f2`
-                       destip=`echo $dest | cut -d: -f1`
-                       destport=`echo $dest | cut -d: -f2`
-                       ctdb killtcp $srcip:$srcport $destip:$destport >/dev/null 2>&1 
-#                      ctdb killtcp $destip:$destport $srcip:$srcport >/dev/null 2>&1
-               done
-       } > /dev/null 2>&1
-
-
-        # RST the local side for all tcp connections used for NFS to ensure 
-       # that they do not survive in ESTABLISHED state across a 
-       # failover/failback and create an ack storm
-       netstat -tn |egrep "^tcp.*\s+$ip:2049\s+.*ESTABLISHED" | awk '{print $4" "$5}' | while read dest src; do
-               srcip=`echo $src | cut -d: -f1`
-               srcport=`echo $src | cut -d: -f2`
-               destip=`echo $dest | cut -d: -f1`
-               destport=`echo $dest | cut -d: -f2`
-               ctdb killtcp $srcip:$srcport $destip:$destport >/dev/null 2>&1 
-#              ctdb killtcp $destip:$destport $srcip:$srcport >/dev/null 2>&1
-       done
-
-       exit 0
-       ;;
-
-     recovered)
-       # always restart the lockmanager so that we start with a clusterwide
-       # graceperiod when ip addresses has changed
-       [ -x /etc/ctdb/statd-callout ] && {
-               /etc/ctdb/statd-callout notify &
-       } >/dev/null 2>&1
-
-       /bin/rm -f /etc/ctdb/state/statd/restart
+       ctdb_service_set_reconfigure
         ;;
  
        monitor)
+       if ctdb_service_needs_reconfigure ; then
+           ctdb_service_reconfigure
+           exit 0
+       fi
+
+       update_tickles 2049
+
+       # check that statd responds to rpc requests
+       # if statd is not running we try to restart it
+       # we only do this IF we have an rpc.statd command.
+       # For platforms where rpc.statd does not exist, we skip
+       # the check completely
+       p="rpc.statd"
+       which $p >/dev/null 2>/dev/null && {
+               if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
+                       (service_name="nfs_statd"; ctdb_counter_init)
+               else
+                       cmd="$p"
+                       cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+                       cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
+                       cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+                       (
+                               service_name="nfs_statd"
+                               ctdb_counter_incr
+                               ctdb_check_counter_limit 10 quiet >/dev/null
+                       ) || {
+                               echo "$ctdb_check_rpc_out"
+                               echo "Trying to restart STATD [$cmd]"
+                               $cmd
+                       }
+               fi
+       }
+
         # check that NFS responds to rpc requests
-       ctdb_check_rpc "NFS" 100003 3
-       ctdb_check_rpc "mount" 100005 1
+       [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
+           if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
+               (service_name="nfs_knfsd"; ctdb_counter_init)
+           else
+               (
+                       service_name="nfs_knfsd"
+                       ctdb_counter_incr
+
+                       ctdb_check_counter_equal 2 || {
+                               echo "Trying to restart NFS service"
+                               startstop_nfs restart >/dev/null 2>&1 &
+                               exit 0
+                       }
+
+                       ctdb_check_counter_limit 5 quiet >/dev/null
+               ) || {
+                       echo "$ctdb_check_rpc_out"
+                       echo "Trying to restart NFS service"
+                       startstop_nfs restart
+                       exit 1
+               }
+           fi
+       }
  
         # and that its directories are available
-       nfs_dirs=`grep -v '^#' < /etc/exports | cut -d' ' -f1`
-       ctdb_check_directories "nfs" $nfs_dirs
+       [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
+           exportfs | grep -v '^#' | grep '^/' |
+           sed -e 's/[[:space:]]\+[^[:space:]]*$//' |
+           ctdb_check_directories
+       } || exit $?
  
         # check that lockd responds to rpc requests
-       ctdb_check_rpc "statd" 100024 1
-       ctdb_check_rpc "lockd" 100021 1
-       ctdb_check_directories "statd" $STATD_SHARED_DIRECTORY
+       if ctdb_check_rpc "LOCKD" 100021 1 >/dev/null ; then
+               (service_name="lockd"; ctdb_counter_init)
+       else
+               (
+                       service_name="lockd"
+                       ctdb_counter_incr
+
+                       ctdb_check_counter_equal 10 || {
+                               echo "Trying to restart NFS lock service"
+                               startstop_nfs restart >/dev/null 2>&1 &
+                               startstop_nfslock restart  >/dev/null 2>&1 &
+                               exit 0
+                       }
+
+                       ctdb_check_counter_limit 15 quiet >/dev/null
+       ) || {
+                       echo "$ctdb_check_rpc_out"
+                       echo "Trying to restart NFS lock service"
+                       startstop_nfs restart
+                       startstop_nfslock restart
+                       exit 1
+               }
+       fi
+
+       # mountd needs special handling since it is sometimes not started
+       # correctly on RHEL5
+       if ctdb_check_rpc "MOUNTD" 100005 1 >/dev/null ; then
+               (service_name="nfs_mountd"; ctdb_counter_init)
+       else
+       (
+               service_name="nfs_mountd"
+               ctdb_counter_incr
+
+               ctdb_check_counter_equal 5 || {
+                       p="rpc.mountd"
+                       cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+                       echo "Trying to restart MOUNTD [${cmd}]"
+                       killall -q -9 $p
+                       $cmd &
+                       exit 0
+               }
+
+               ctdb_check_counter_limit 10 quiet >/dev/null
+       ) || {
+               echo "$ctdb_check_rpc_out"
+               p="rpc.mountd"
+               cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+               echo "Trying to restart MOUNTD [${cmd}]"
+               killall -q -9 $p
+               $cmd &
+               exit 1
+       }
+       fi
+
+
+       # rquotad needs special handling since it is sometimes not started
+       # correctly on RHEL5
+       # this is not a critical service so we don't flag the node as unhealthy
+       ctdb_check_rpc "RQUOTAD" 100011 1 || {
+               p="rpc.rquotad"
+               cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+               echo "Trying to restart RQUOTAD [${cmd}]"
+               killall -q -9 $p
+               $cmd &
+       }
+
+       # at most once every 60 seconds, update the statd state database that
+       # records which clients need notifications
+       LAST_UPDATE=`stat --printf="%Y" $CTDB_VARDIR/state/statd/update-trigger 2>/dev/null`
+       CURRENT_TIME=`date +"%s"`
+       [ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && {
+           mkdir -p $CTDB_VARDIR/state/statd
+           touch $CTDB_VARDIR/state/statd/update-trigger
+           $CTDB_BASE/statd-callout updatelocal &
+           $CTDB_BASE/statd-callout updateremote &
+       }
+               ;;
+
+    ipreallocated)
+       # if the IPs have been reallocated, we must restart the lock manager
+       # across all nodes and ping all statd listeners
+       [ -x $CTDB_BASE/statd-callout ] && {
+               $CTDB_BASE/statd-callout notify &
+       } >/dev/null 2>&1
+       ;;
+    *)
+       ctdb_standard_event_handler "$@"
         ;;
-
  esac
  
  exit 0