update_tickles 2049
nfs_update_lock_info
- # check that statd responds to rpc requests
- # if statd is not running we try to restart it
- # we only do this IF we have a rpc.statd command.
- # For platforms where rpc.statd does not exist, we skip
- # the check completely
- p="rpc.statd"
- type $p >/dev/null 2>/dev/null && \
- nfs_check_rpc_service "statd" \
- -ge 6 "verbose restart:b unhealthy" \
- % 2 "verbose restart:b"
+ nfs_check_service "status" <<EOF
+version="1" # could drop this and use any version?
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.statd"
+service_start_cmd="rpc.statd ${STATD_HA_CALLOUT:+-H} $STATD_HA_CALLOUT ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME ${STATD_PORT:+-p} $STATD_PORT ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT"
+service_debug_cmd="program_stack_traces rpc.statd 5"
+EOF
if [ "$CTDB_SKIP_GANESHA_NFSD_CHECK" != "yes" ] ; then
monitor_ganesha_nfsd
fi
# rquotad is sometimes not started correctly on RHEL5
- nfs_check_rpc_service "rquotad" \
- -ge 6 "verbose restart:b unhealthy" \
- % 2 "verbose restart:b"
+ nfs_check_service "rquotad" <<EOF
+version="1" # could drop this and use any version?
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.rquotad"
+service_start_cmd="rpc.rquotad ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT"
+service_debug_cmd="program_stack_traces rpc.rquotad 5"
+EOF
;;
*)
update_tickles 2049
nfs_update_lock_info
- nfs_check_rpc_services
+ nfs_check_services
nfs_check_thread_count
;;
nfs_dump_some_threads
service nfsserver start
;;
+ restart-stop)
+ set_proc "fs/nfsd/threads" 0
+ service nfsserver stop > /dev/null 2>&1
+ pkill -9 nfsd
+ ;;
esac
;;
rhel)
service nfslock start
service nfs start
;;
+ restart-stop)
+ set_proc "fs/nfsd/threads" 0
+ service nfs stop > /dev/null 2>&1
+ service nfslock stop > /dev/null 2>&1
+ pkill -9 nfsd
+ ;;
esac
;;
*)
--- /dev/null
+# status
+version="1"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.statd"
+service_start_cmd="rpc.statd ${STATD_HA_CALLOUT:+-H} $STATD_HA_CALLOUT ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME ${STATD_PORT:+-p} $STATD_PORT ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT"
+service_debug_cmd="program_stack_traces rpc.statd 5"
--- /dev/null
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="startstop_nfs restart-stop"
+service_start_cmd="startstop_nfs start"
+service_debug_cmd="program_stack_traces nfsd 5"
--- /dev/null
+# nlockmgr
+version="4"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="startstop_nfslock stop"
+service_start_cmd="startstop_nfslock start"
--- /dev/null
+# mountd
+version="1"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.mountd"
+service_start_cmd="rpc.mountd $RPCMOUNTDOPTS ${MOUNTD_PORT:+-p} $MOUNTD_PORT"
+service_debug_cmd="program_stack_traces rpc.mountd 5"
--- /dev/null
+# rquotad
+version="1"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.rquotad"
+service_start_cmd="rpc.rquotad ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT"
+service_debug_cmd="program_stack_traces rpc.rquotad 5"
--- /dev/null
+NFS check configuration files.
+
+Files are named NN.RPCSERVICE.check. Files without a .check suffix
+are ignored.
+
+Supported variables are:
+
+* family - "tcp", "udp", or a space-separated list of these
+ default: tcp
+* version - optional, RPC service version number
+ default: unset, meaning check for any version
+* unhealthy_after - number of failed checks before unhealthy
+ default: 1
+* restart_every - restart the service on every Nth failed check
+ default: 0, meaning no restart
+* service_stop_cmd - command to stop service
+ default: no default, must be provided if
+ restart_every > 0
+* service_start_cmd - command to start service
+ default: no default, must be provided if
+ restart_every > 0
+* service_debug_cmd - command to debug a service after trying to stop it;
+ for example, it can be useful to print stack
+ traces of threads that have not exited, since
+ they may be stuck doing I/O;
+ no default, see also function program_stack_traces()
+
+Quoting inside values is not preserved.
# This is a hack. All documents should be installed in /usr/share/doc.
rm -f $RPM_BUILD_ROOT%{_sysconfdir}/ctdb/events.d/README
cp config/events.d/README README.eventscripts
+rm -f $RPM_BUILD_ROOT%{_sysconfdir}/ctdb/nfs-checks.d/README
+cp config/nfs-checks.d/README README.nfs-checks.d
cp config/notify.d.README README.notify.d
# Remove "*.old" files
%{_sysconfdir}/ctdb/events.d/70.iscsi
%{_sysconfdir}/ctdb/events.d/91.lvs
%{_sysconfdir}/ctdb/events.d/99.timeout
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/10.statd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/20.nfsd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/30.lockd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/40.mountd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/50.rquotad.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/10.status.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/20.nfs.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/30.nlockmgr.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/40.mountd.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/50.rquotad.check
%{_sysconfdir}/ctdb/statd-callout
%{_sbindir}/ctdbd
%{_sbindir}/ctdbd_wrapper
--- /dev/null
+../../../config/nfs-checks.d
\ No newline at end of file
done
}
-mark_background ()
-{
- sed -e 's@^@\&@'
-}
-
-convert_progname ()
+guess_output ()
{
case "$1" in
- nfs) echo "nfsd" ;;
- nlockmgr) echo "lockd" ;;
- status) echo "statd" ;;
- *) echo "$1" ;;
+ startstop_nfslock\ start)
+ echo "&Starting nfslock: OK"
+ ;;
+ startstop_nfs\ start)
+ cat <<EOF
+&Starting nfslock: OK
+&Starting nfs: OK
+EOF
+ ;;
+ *)
+ : # Nothing
esac
}
-
# Set the required result for a particular RPC program having failed
# for a certain number of iterations. This is probably still a work
# in progress. Note that we could hook aggressively
rpc_set_service_failure_response ()
{
_rpc_service="$1"
- # The number of failures defaults to the iteration number. This
- # will be true when we fail from the 1st iteration... but we need
- # the flexibility to set the number of failures.
- _numfails="${2:-${iteration:-1}}"
+ _numfails="${2:-1}" # default 1
- _progname=$(convert_progname "$_rpc_service")
+ # Default
+ ok_null
+ if [ $_numfails -eq 0 ] ; then
+ return
+ fi
nfs_load_config
_nl="
"
- # Default
- ok_null
+ _dir="${CTDB_NFS_CHECKS_DIR:-${CTDB_BASE}/nfs-checks.d}"
- _file=$(ls "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9]."${_progname}.check")
+ _file=$(ls "$_dir"/[0-9][0-9]."${_rpc_service}.check")
[ -r "$_file" ] || die "RPC check file \"$_file\" does not exist or is not unique"
- while read _op _li _actions ; do
- # Skip comments
- case "$_op" in
- \#*) continue ;;
- esac
+ _out=$(mktemp --tmpdir="$EVENTSCRIPTS_TESTS_VAR_DIR")
+ _rc_file=$(mktemp --tmpdir="$EVENTSCRIPTS_TESTS_VAR_DIR")
- _hit=false
- if [ "$_op" != "%" ] ; then
- if [ $_numfails $_op $_li ] ; then
- _hit=true
- fi
+ (
+ # Subshell to restrict the scope of variables...
+
+ # Defaults
+ family="tcp"
+ version=""
+ unhealthy_after=1
+ restart_every=0
+ service_stop_cmd=""
+ service_start_cmd=""
+ service_debug_cmd=""
+
+ # Don't bother syntax checking, eventscript does that...
+ . "$_file"
+
+ # Just use the first version, default to 1. This is dumb but
+ # handles all the cases that we care about now...
+ if [ -n "$version" ] ; then
+ _ver="${version%% *}"
else
- if [ $_numfails -gt 0 -a $(($_numfails $_op $_li)) -eq 0 ] ; then
- _hit=true
- fi
+ _ver=1
fi
- if $_hit ; then
- _out=""
- _rc=0
- for _action in $_actions ; do
- case "$_action" in
- verbose)
- _ver=1
- case "$_rpc_service" in
- nfs) _ver=3 ;;
- nlockmgr) _ver=4 ;;
- esac
- _out="\
-ERROR: $_rpc_service failed RPC check:
+ _rpc_check_out="\
+$_rpc_service failed RPC check:
rpcinfo: RPC: Program not registered
program $_rpc_service version $_ver is not available"
- ;;
- restart*)
- _p="rpc.${_progname}"
- case "$_action" in
- *:b) _bg=mark_background ;;
- *) _bg=cat ;;
- esac
- case "$_progname" in
- nfsd)
- _t=$(program_stack_traces "nfsd" 5)
- _t="${_t}${_t:+${_nl}}Starting nfslock: OK
-Starting nfs: OK"
- _t=$(echo "$_t" | $_bg)
- _t="\
-Trying to restart NFS service
-${_t}"
- ;;
- lockd)
- _t=$(echo "Starting nfslock: OK" | $_bg)
- _t="Trying to restart lock manager service${_t:+${_nl}}${_t}"
- ;;
- *)
- _t="Trying to restart $_progname [${_p}]"
- _stacks=$(program_stack_traces "$_p" 5)
- _t="${_t}${_stacks:+${_nl}}${_stacks}"
- esac
- _out="${_out}${_out:+${_nl}}${_t}"
- ;;
- unhealthy)
- _rc=1
- esac
- done
- required_result $_rc "$_out"
- return
+
+ if [ $unhealthy_after -gt 0 -a $_numfails -ge $unhealthy_after ] ; then
+ _unhealthy=true
+ echo 1 >"$_rc_file"
+ echo "ERROR: ${_rpc_check_out}" >>"$_out"
+ else
+ _unhealthy=false
+ echo 0 >"$_rc_file"
fi
- done <"$_file"
+
+ if [ $restart_every -gt 0 -a $(($_numfails % $restart_every)) -eq 0 ] ; then
+ if ! $_unhealthy ; then
+ echo "WARNING: ${_rpc_check_out}" >>"$_out"
+ fi
+
+ echo "Trying to restart service \"${_rpc_service}\"..." >>"$_out"
+
+ if [ -n "$service_debug_cmd" ] ; then
+ $service_debug_cmd 2>&1 >>"$_out"
+ fi
+
+ guess_output "$service_start_cmd" >>"$_out"
+ fi
+ )
+
+ read _rc <"$_rc_file"
+ required_result $_rc <"$_out"
+
+ rm -f "$_out" "$_rc_file"
}
######################################################################
etc_subdirs = [
'events.d',
- 'nfs-rpc-checks.d'
+ 'nfs-checks.d'
]
if bld.env.standalone_ctdb:
test_eventscript_links = [
'events.d',
'functions',
- 'nfs-rpc-checks.d',
+ 'nfs-checks.d',
'statd-callout'
]