config/events.d/60.nfs

   1 #!/bin/sh
   2 # script to manage nfs in a clustered environment
   3
   4 . $CTDB_BASE/functions
   5 loadconfig ctdb
   6 loadconfig nfs
   7
   8 [ "$CTDB_MANAGES_NFS" = "yes" ] || exit 0
   9 [ -z "$STATD_SHARED_DIRECTORY" ] && exit 0
  10
  11 cmd="$1"
  12 shift
  13
  14 PATH=/usr/bin:/bin:/usr/sbin:/sbin:$PATH
  15
  16
  17
  18 case $cmd in
  19      startup)
  20         /bin/mkdir -p $CTDB_BASE/state/nfs
  21         /bin/mkdir -p $CTDB_BASE/state/statd/ip
  22         /bin/mkdir -p $STATD_SHARED_DIRECTORY
  23
  24         /bin/rm -f $CTDB_BASE/state/statd/statd.restart >/dev/null 2>/dev/null
  25
  26         # make sure nfs is stopped before we start it, or it may get a bind error
  27         startstop_nfs stop
  28         startstop_nfs start
  29         ;;
  30
  31      shutdown)
  32         startstop_nfs stop
  33         exit 0
  34         ;;
  35
  36      takeip)
  37         ip=$2
  38
  39         echo $ip >> $CTDB_BASE/state/statd/restart
  40
  41         # having a list of what IPs we have allows statd to do the right
  42         # thing via $CTDB_BASE/statd-callout
  43         touch $CTDB_BASE/state/statd/ip/$ip
  44         exit 0
  45         ;;
  46
  47      releaseip)
  48         iface=$1
  49         ip=$2
  50         maskbits=$3
  51
  52         echo $ip >> $CTDB_BASE/state/statd/restart
  53         /bin/rm -f $CTDB_BASE/state/statd/ip/$ip
  54         exit 0
  55         ;;
  56
  57      recovered)
  58         # if no IPs have changed then don't need to restart statd
  59         [ -f $CTDB_BASE/state/statd/restart ] || exit 0;
  60
  61         # always restart the lockmanager so that we start with a clusterwide
  62         # graceperiod when ip addresses has changed
  63         [ -x $CTDB_BASE/statd-callout ] && {
  64                 $CTDB_BASE/statd-callout notify &
  65         } >/dev/null 2>&1
  66
  67         /bin/rm -f $CTDB_BASE/state/statd/restart
  68         ;;
  69
  70       monitor)
  71         # check how many times in a row that nfsd has stopped responding
  72         # after 3 times in a row we try to restart the full nfs service
  73         NFSD_FAIL_COUNT_FILE=$CTDB_BASE/state/nfs/nfsd_fail_count
  74         [ ! -f $NFSD_FAIL_COUNT_FILE ] && {
  75                 echo 0 > $NFSD_FAIL_COUNT_FILE
  76         }
  77         NFSD_FAIL_COUNT=`cat $NFSD_FAIL_COUNT_FILE`
  78         [ -z "$NFSD_FAIL_COUNT" ] && {
  79                 echo 0 > $NFSD_FAIL_COUNT_FILE
  80                 NFSD_FAIL_COUNT=`cat $NFSD_FAIL_COUNT_FILE`
  81         }
  82         # ok it has failed a few times too many. try restarting it.
  83         [ `expr "$NFSD_FAIL_COUNT" ">" "3"` != "0" ] && {
  84                 echo 60.nfs NFSD: trying to restart NFSD...
  85                 echo 0 > $NFSD_FAIL_COUNT_FILE
  86                 service nfs restart
  87                 exit 0
  88         }
  89         expr "$NFSD_FAIL_COUNT" "+" "1" > $NFSD_FAIL_COUNT_FILE
  90
  91
  92         # check that statd responds to rpc requests
  93         # if statd is not running we try to restart it once and wait
  94         # for the next monitoring event to verify if it is running or not
  95         # if it still fails we fail and mark the node as UNHEALTHY
  96         if [ -f $CTDB_BASE/state/statd/statd.restart ]; then
  97                 # statd was restarted, see if it came up ok
  98                 rpcinfo -u localhost 100024 1 > /dev/null || {
  99                         echo "ERROR: Failed to restart STATD"
 100                         exit 1
 101                 }
 102                 echo "STATD successfully restarted."
 103                 /bin/rm -f $CTDB_BASE/state/statd/statd.restart
 104         else
 105                 rpcinfo -u localhost 100024 1 > /dev/null || {
 106                         RPCSTATDOPTS=""
 107                         [ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n $STATD_HOSTNAME"
 108                         [ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p $STATD_PORT"
 109                         [ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o $STATD_OUTGOING_PORT"
 110                         rpc.statd $RPCSTATDOPTS
 111                         echo "ERROR: STATD is not responding. Trying to restart it. [rpc.statd $RPCSTATDOPTS]"
 112                         touch $CTDB_BASE/state/statd/statd.restart
 113                 }
 114         fi
 115
 116
 117
 118         # check that NFS responds to rpc requests
 119         ctdb_check_rpc "NFS" 100003 3
 120
 121         # and that its directories are available
 122         nfs_dirs=$(exportfs | grep -v '^#' | grep '^/' | awk {'print $1;'})
 123         ctdb_check_directories "nfs" $nfs_dirs
 124
 125         # check that lockd responds to rpc requests
 126         ctdb_check_rpc "lockd" 100021 1
 127         ctdb_check_directories "statd" $STATD_SHARED_DIRECTORY
 128
 129         # mount needs special handling since it is sometimes not started
 130         # correctly on RHEL5
 131         rpcinfo -u localhost 100005 1 > /dev/null || {
 132                 echo "ERROR: MOUNTD is not running. Trying to restart it."
 133                 RPCMOUNTDOPTS=""
 134                 [ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT"
 135                 killall -q -9 rpc.mountd
 136                 rpc.mountd $RPCMOUNTDOPTS &
 137                 exit 1
 138         }
 139
 140         # everything was ok with nfs so reset the fail count back to 0
 141         echo 0 > $NFSD_FAIL_COUNT_FILE
 142         ;;
 143
 144 esac
 145
 146 exit 0