update the monitor event for nfs to track how many times in a row it has failed
[sahlberg/ctdb.git] / config / events.d / 60.nfs
#!/bin/sh
# Event script to manage nfs in a clustered environment.
#
# Usage: 60.nfs <event> [event arguments...]
# Environment: CTDB_BASE must point at the ctdb configuration directory.

# Pull in the shared helpers, then load the ctdb and nfs configuration.
# loadconfig is not defined in this script; presumably it is provided by
# the sourced functions file.
. "$CTDB_BASE/functions"
loadconfig ctdb
loadconfig nfs

# Nothing to do unless ctdb is configured to manage nfs and a shared
# statd directory has been configured.
[ "$CTDB_MANAGES_NFS" = "yes" ] || exit 0
[ -n "$STATD_SHARED_DIRECTORY" ] || exit 0

# The first argument is the event name; the remaining arguments are left
# in "$@" for the event handlers.
cmd="$1"
# A bare "shift" with no arguments is an error that aborts a strict POSIX
# sh, so only shift when there is something to shift.
if [ "$#" -gt 0 ]; then
        shift
fi

PATH=/usr/bin:/bin:/usr/sbin:/sbin:$PATH
15
16
17
# Handle one ctdb event for the nfs service.
#
# Arguments:
#   $1      event name: startup, shutdown, takeip, releaseip, recovered or
#           monitor; any other value is silently ignored
#   $2..$4  for takeip/releaseip: interface, ip address, netmask bits
#
# Reads CTDB_BASE and STATD_SHARED_DIRECTORY, plus the optional
# STATD_HOSTNAME, STATD_PORT, STATD_OUTGOING_PORT and MOUNTD_PORT settings.
# startstop_nfs, ctdb_check_rpc and ctdb_check_directories are not defined
# in this script; presumably they come from the sourced functions file.
#
# Several events call "exit" directly, so this function is meant to be
# called from the top level of the event script only.
nfs_event() {
        event="$1"
        shift

        case "$event" in
             startup)
                /bin/mkdir -p "$CTDB_BASE/state/nfs"
                /bin/mkdir -p "$CTDB_BASE/state/statd/ip"
                /bin/mkdir -p "$STATD_SHARED_DIRECTORY"

                /bin/rm -f "$CTDB_BASE/state/statd/statd.restart" >/dev/null 2>&1

                # make sure nfs is stopped before we start it, or it may
                # get a bind error
                startstop_nfs stop
                startstop_nfs start
                ;;

             shutdown)
                startstop_nfs stop
                exit 0
                ;;

             takeip)
                ip="$2"

                # the existence of this file makes the next "recovered"
                # event run the statd-callout notification
                echo "$ip" >> "$CTDB_BASE/state/statd/restart"

                # having a list of what IPs we have allows statd to do the
                # right thing via $CTDB_BASE/statd-callout
                touch "$CTDB_BASE/state/statd/ip/$ip"
                exit 0
                ;;

             releaseip)
                iface="$1"
                ip="$2"
                maskbits="$3"

                echo "$ip" >> "$CTDB_BASE/state/statd/restart"
                /bin/rm -f "$CTDB_BASE/state/statd/ip/$ip"
                exit 0
                ;;

             recovered)
                # if no IPs have changed then we don't need to restart statd
                [ -f "$CTDB_BASE/state/statd/restart" ] || exit 0

                # always restart the lockmanager so that we start with a
                # clusterwide graceperiod when ip addresses have changed
                if [ -x "$CTDB_BASE/statd-callout" ]; then
                        "$CTDB_BASE/statd-callout" notify >/dev/null 2>&1 &
                fi

                /bin/rm -f "$CTDB_BASE/state/statd/restart"
                ;;

             monitor)
                # Track how many monitor events in a row have ended in a
                # failure.  The counter is incremented up front and only
                # reset to 0 at the very end, once every check has passed.
                # When the stored count is greater than 3 the full nfs
                # service is restarted instead of running the checks.
                NFSD_FAIL_COUNT_FILE="$CTDB_BASE/state/nfs/nfsd_fail_count"
                NFSD_FAIL_COUNT=0
                if [ -f "$NFSD_FAIL_COUNT_FILE" ]; then
                        NFSD_FAIL_COUNT=$(cat "$NFSD_FAIL_COUNT_FILE")
                fi
                # A missing, empty or corrupt counter counts as 0.  Without
                # this check non-numeric content would be compared as a
                # string and could trigger a spurious nfs restart.
                case "$NFSD_FAIL_COUNT" in
                    ''|*[!0-9]*|0?*) NFSD_FAIL_COUNT=0 ;;
                esac

                # ok it has failed a few times too many. try restarting it.
                if [ "$NFSD_FAIL_COUNT" -gt 3 ]; then
                        echo "60.nfs NFSD: trying to restart NFSD..."
                        echo 0 > "$NFSD_FAIL_COUNT_FILE"
                        service nfs restart
                        exit 0
                fi
                echo "$((NFSD_FAIL_COUNT + 1))" > "$NFSD_FAIL_COUNT_FILE"

                # check that statd responds to rpc requests
                # if statd is not running we try to restart it once and wait
                # for the next monitoring event to verify if it is running
                # or not; if it still fails we fail and mark the node as
                # UNHEALTHY
                statd_restart_flag="$CTDB_BASE/state/statd/statd.restart"
                if [ -f "$statd_restart_flag" ]; then
                        # statd was restarted, see if it came up ok
                        if ! rpcinfo -u localhost 100024 1 > /dev/null; then
                                echo "ERROR: Failed to restart STATD"
                                exit 1
                        fi
                        echo "STATD successfully restarted."
                        /bin/rm -f "$statd_restart_flag"
                elif ! rpcinfo -u localhost 100024 1 > /dev/null; then
                        RPCSTATDOPTS=""
                        [ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n $STATD_HOSTNAME"
                        [ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p $STATD_PORT"
                        [ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o $STATD_OUTGOING_PORT"
                        # unquoted on purpose: the options must be split
                        # into separate words
                        rpc.statd $RPCSTATDOPTS
                        echo "ERROR: STATD is not responding. Trying to restart it. [rpc.statd $RPCSTATDOPTS]"
                        touch "$statd_restart_flag"
                fi

                # check that NFS responds to rpc requests
                ctdb_check_rpc "NFS" 100003 3

                # and that its directories are available
                nfs_dirs=$(exportfs | grep -v '^#' | grep '^/' | awk '{print $1}')
                # unquoted on purpose: one argument per exported directory
                ctdb_check_directories "nfs" $nfs_dirs

                # check that lockd responds to rpc requests
                ctdb_check_rpc "lockd" 100021 1
                ctdb_check_directories "statd" "$STATD_SHARED_DIRECTORY"

                # mount needs special handling since it is sometimes not
                # started correctly on RHEL5
                if ! rpcinfo -u localhost 100005 1 > /dev/null; then
                        echo "ERROR: MOUNTD is not running. Trying to restart it."
                        RPCMOUNTDOPTS=""
                        [ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT"
                        killall -q -9 rpc.mountd
                        # unquoted on purpose: see RPCSTATDOPTS above
                        rpc.mountd $RPCMOUNTDOPTS &
                        exit 1
                fi

                # everything was ok with nfs so reset the fail count to 0
                echo 0 > "$NFSD_FAIL_COUNT_FILE"
                ;;

        esac
}

nfs_event "$cmd" "$@"
145
# Any event that did not exit on its own, including an unknown event name,
# is reported as success.
exit 0