2 # script to manage NFS in a clustered environment
# Exit 0 (nothing to do) unless CTDB_MANAGES_NFS is exactly "yes".
8 [ "$CTDB_MANAGES_NFS" = "yes" ] || exit 0
# Also exit 0 when STATD_SHARED_DIRECTORY is empty or unset.
9 [ -z "$STATD_SHARED_DIRECTORY" ] && exit 0
# Put the standard system binary directories ahead of the inherited PATH.
14 PATH=/usr/bin:/bin:/usr/sbin:/sbin:$PATH
# Create the nfs and statd state directories under $CTDB_BASE, plus the
# shared statd directory; -p makes this a no-op when they already exist.
# NOTE(review): the expansions below are unquoted, so a path containing
# whitespace or glob characters would be split/expanded - confirm that
# cannot happen for CTDB_BASE and STATD_SHARED_DIRECTORY.
20 /bin/mkdir -p $CTDB_BASE/state/nfs
21 /bin/mkdir -p $CTDB_BASE/state/statd/ip
22 /bin/mkdir -p $STATD_SHARED_DIRECTORY
# Remove the statd.restart marker file if present; stdout and stderr are
# both discarded.
24 /bin/rm -f $CTDB_BASE/state/statd/statd.restart >/dev/null 2>/dev/null
26 # make sure NFS is stopped before we start it, or it may get a bind error
# Append $ip to the statd "restart" list file. The existence of that file
# is what later allows the statd-callout notify step to run.
# NOTE(review): $ip is assigned outside the lines visible here.
39 echo $ip >> $CTDB_BASE/state/statd/restart
41 # having a list of what IPs we have allows statd to do the right
42 # thing via $CTDB_BASE/statd-callout
# Record the address as a (possibly empty) file named after the IP.
43 touch $CTDB_BASE/state/statd/ip/$ip
# Append $ip to the statd "restart" list file, then delete the per-IP
# marker file for that address (-f: no error if it does not exist).
# NOTE(review): $ip is assigned outside the lines visible here.
52 echo $ip >> $CTDB_BASE/state/statd/restart
53 /bin/rm -f $CTDB_BASE/state/statd/ip/$ip
58 # if no IPs have changed then there is no need to restart statd
# The "restart" file only exists if an IP was appended to it earlier;
# without it, exit 0 immediately.
59 [ -f $CTDB_BASE/state/statd/restart ] || exit 0;
61 # always restart the lock manager so that we start with a cluster-wide
62 # grace period when IP addresses have changed
# Only attempt the notify when statd-callout exists and is executable; it
# is launched in the background (&), so its exit status is not checked.
# NOTE(review): the closing brace of this group is not visible in this
# extract (embedded line numbers jump from 64 to 67).
63 [ -x $CTDB_BASE/statd-callout ] && {
64 $CTDB_BASE/statd-callout notify &
# Clear the list of changed IPs.
67 /bin/rm -f $CTDB_BASE/state/statd/restart
71 # check how many times in a row nfsd has stopped responding;
72 # after 3 times in a row we try to restart the full NFS service
# NOTE(review): the test further down is "> 3", i.e. it fires once the
# stored count exceeds 3, not when it reaches 3 - confirm which is intended.
73 NFSD_FAIL_COUNT_FILE=$CTDB_BASE/state/nfs/nfsd_fail_count
# Initialise the counter file to 0 if it does not exist yet.
# NOTE(review): the closing braces of the three "&& {" groups in this
# section are not visible in this extract.
74 [ ! -f $NFSD_FAIL_COUNT_FILE ] && {
75 echo 0 > $NFSD_FAIL_COUNT_FILE
77 NFSD_FAIL_COUNT=`cat $NFSD_FAIL_COUNT_FILE`
# If the file was empty, reset it to 0 and read it again.
78 [ -z "$NFSD_FAIL_COUNT" ] && {
79 echo 0 > $NFSD_FAIL_COUNT_FILE
80 NFSD_FAIL_COUNT=`cat $NFSD_FAIL_COUNT_FILE`
82 # ok, it has failed a few times too many. try restarting it.
# expr prints 1 when the comparison holds and 0 otherwise, so this group
# runs when NFSD_FAIL_COUNT is greater than 3.
83 [ `expr "$NFSD_FAIL_COUNT" ">" "3"` != "0" ] && {
84 echo 60.nfs NFSD: trying to restart NFSD...
85 echo 0 > $NFSD_FAIL_COUNT_FILE
# Store the previously read count plus one. This uses the value held in
# NFSD_FAIL_COUNT, not a fresh read of the file.
89 expr "$NFSD_FAIL_COUNT" "+" "1" > $NFSD_FAIL_COUNT_FILE
92 # check that statd responds to rpc requests
93 # if statd is not running we try to restart it once and wait
94 # for the next monitoring event to verify if it is running or not
95 # if it still fails we fail and mark the node as UNHEALTHY
# The statd.restart marker file records that a restart was attempted
# earlier (it is touched at the end of this section).
# NOTE(review): no "else"/"fi" for this "if", and no closing braces for the
# two "|| {" groups, are visible in this extract.
96 if [ -f $CTDB_BASE/state/statd/statd.restart ]; then
97 # statd was restarted, see if it came up ok
# Probe RPC program 100024 version 1 on localhost; the group after "||"
# runs only when the probe fails.
98 rpcinfo -u localhost 100024 1 > /dev/null || {
99 echo "ERROR: Failed to restart STATD"
102 echo "STATD successfully restarted."
103 /bin/rm -f $CTDB_BASE/state/statd/statd.restart
# Same probe again; on failure, build the option string and start rpc.statd.
105 rpcinfo -u localhost 100024 1 > /dev/null || {
# Append -n / -p / -o to RPCSTATDOPTS only for the settings that are
# non-empty.
# NOTE(review): the initial value of RPCSTATDOPTS is set outside the lines
# visible here.
107 [ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n $STATD_HOSTNAME"
108 [ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p $STATD_PORT"
109 [ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o $STATD_OUTGOING_PORT"
# RPCSTATDOPTS is deliberately left unquoted so it splits into separate
# arguments.
110 rpc.statd $RPCSTATDOPTS
# The message is printed after rpc.statd has already been invoked above.
111 echo "ERROR: STATD is not responding. Trying to restart it. [rpc.statd $RPCSTATDOPTS]"
# Leave a marker so the "if" at the top of this section can tell that a
# restart was attempted.
112 touch $CTDB_BASE/state/statd/statd.restart
# NOTE(review): ctdb_check_rpc and ctdb_check_directories are defined
# outside the lines visible here; their failure behaviour should be
# confirmed against their definitions.
118 # check that NFS responds to rpc requests
119 ctdb_check_rpc "NFS" 100003 3
121 # and that its directories are available
# Take the exportfs output, drop lines starting with '#', keep lines
# starting with '/', and collect the first whitespace-separated field of
# each. The awk program is written as {'print $1;'}, which the shell joins
# into the single word {print $1;}.
122 nfs_dirs=$(exportfs | grep -v '^#' | grep '^/' | awk {'print $1;'})
# nfs_dirs is unquoted so each directory becomes its own argument.
# NOTE(review): a directory path containing whitespace would be split too.
123 ctdb_check_directories "nfs" $nfs_dirs
125 # check that lockd responds to rpc requests
126 ctdb_check_rpc "lockd" 100021 1
# Also check the shared statd directory.
127 ctdb_check_directories "statd" $STATD_SHARED_DIRECTORY
129 # mountd needs special handling since it is sometimes not started
# Probe RPC program 100005 version 1 on localhost; the group after "||"
# runs only when the probe fails.
# NOTE(review): the closing brace of this group is not visible in this
# extract.
131 rpcinfo -u localhost 100005 1 > /dev/null || {
132 echo "ERROR: MOUNTD is not running. Trying to restart it."
# Append -p only when MOUNTD_PORT is non-empty.
# NOTE(review): the initial value of RPCMOUNTDOPTS is set outside the lines
# visible here.
134 [ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT"
# Send signal 9 to any rpc.mountd processes (-q: stay quiet if none are
# found), then start a new one in the background without waiting for it.
135 killall -q -9 rpc.mountd
136 rpc.mountd $RPCMOUNTDOPTS &
140 # everything was ok with NFS so reset the fail count back to 0
# Overwrites the counter file that the nfsd failure check reads and
# increments.
141 echo 0 > $NFSD_FAIL_COUNT_FILE