ctdb-scripts: Switch NFS checks to new style
author Martin Schwenke <martin@meltin.net>
Fri, 19 Jun 2015 06:35:12 +0000 (16:35 +1000)
committer Amitay Isaacs <amitay@samba.org>
Tue, 14 Jul 2015 07:57:18 +0000 (09:57 +0200)
Note that the 60.ganesha RPC checks need to be identical to those in
the nfs-checks.d/ directory.  This is because the NFS unit test
infrastructure checks output against what should be produced by the
checks in nfs-checks.d/.  This is a minor issue, since one of the aims
of this work is to remove the need for a separate 60.ganesha.

In most cases the configuration variable CTDB_NFS_DUMP_STUCK_THREADS
is now ignored.  Instead, the desired number of threads is passed to
the command specified by the service_debug_cmd variable in a .check
file.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
13 files changed:
ctdb/config/events.d/60.ganesha
ctdb/config/events.d/60.nfs
ctdb/config/functions
ctdb/config/nfs-checks.d/10.status.check [new file with mode: 0644]
ctdb/config/nfs-checks.d/20.nfs.check [new file with mode: 0644]
ctdb/config/nfs-checks.d/30.nlockmgr.check [new file with mode: 0644]
ctdb/config/nfs-checks.d/40.mountd.check [new file with mode: 0644]
ctdb/config/nfs-checks.d/50.rquotad.check [new file with mode: 0644]
ctdb/config/nfs-checks.d/README [new file with mode: 0644]
ctdb/packaging/RPM/ctdb.spec.in
ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d [new symlink]
ctdb/tests/eventscripts/scripts/local.sh
ctdb/wscript

index 43c70df1c6d0214f3c95256f2e8ce8fc5f416447..2524fd472fb391dac56e28ff5405626d289713c2 100755 (executable)
@@ -222,25 +222,28 @@ case "$1" in
        update_tickles 2049
        nfs_update_lock_info
 
-       # check that statd responds to rpc requests
-       # if statd is not running we try to restart it
-       # we only do this IF we have a rpc.statd command.
-       # For platforms where rpc.statd does not exist, we skip
-        # the check completely
-       p="rpc.statd"
-       type $p >/dev/null 2>/dev/null && \
-           nfs_check_rpc_service "statd" \
-               -ge 6 "verbose restart:b unhealthy" \
-               %   2 "verbose restart:b"
+       nfs_check_service "status" <<EOF
+version="1"  # could drop this and use any version?
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.statd"
+service_start_cmd="rpc.statd ${STATD_HA_CALLOUT:+-H} $STATD_HA_CALLOUT ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME ${STATD_PORT:+-p} $STATD_PORT ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT"
+service_debug_cmd="program_stack_traces rpc.statd 5"
+EOF
 
        if [ "$CTDB_SKIP_GANESHA_NFSD_CHECK" != "yes" ] ; then
            monitor_ganesha_nfsd
        fi
 
        # rquotad is sometimes not started correctly on RHEL5
-       nfs_check_rpc_service "rquotad" \
-           -ge 6 "verbose restart:b unhealthy" \
-           %   2 "verbose restart:b"
+       nfs_check_service "rquotad" <<EOF
+version="1"  # could drop this and use any version?
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.rquotad"
+service_start_cmd="rpc.rquotad ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT"
+service_debug_cmd="program_stack_traces rpc.rquotad 5"
+EOF
        ;;
 
      *)
index babff1e53302d58ed1099f88aacb5699c254bc9c..997d676091bfd7333b002f4687a875ab01e9e470 100755 (executable)
@@ -92,7 +92,7 @@ case "$1" in
        update_tickles 2049
        nfs_update_lock_info
 
-       nfs_check_rpc_services
+       nfs_check_services
 
        nfs_check_thread_count
                ;;
index 0b0021c79b4f93284b25b09963383d81f57de803..4290bfa841ff5fde50fa415e33101cdd22c38ce5 100755 (executable)
@@ -943,6 +943,11 @@ startstop_nfs() {
                        nfs_dump_some_threads
                        service nfsserver start
                        ;;
+               restart-stop)
+                       set_proc "fs/nfsd/threads" 0
+                       service nfsserver stop > /dev/null 2>&1
+                       pkill -9 nfsd
+                       ;;
                esac
                ;;
        rhel)
@@ -964,6 +969,12 @@ startstop_nfs() {
                        service nfslock start
                        service nfs start
                        ;;
+               restart-stop)
+                       set_proc "fs/nfsd/threads" 0
+                       service nfs stop > /dev/null 2>&1
+                       service nfslock stop > /dev/null 2>&1
+                       pkill -9 nfsd
+                       ;;
                esac
                ;;
        *)
diff --git a/ctdb/config/nfs-checks.d/10.status.check b/ctdb/config/nfs-checks.d/10.status.check
new file mode 100644 (file)
index 0000000..dfa5c59
--- /dev/null
@@ -0,0 +1,7 @@
+# status
+version="1"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.statd"
+service_start_cmd="rpc.statd ${STATD_HA_CALLOUT:+-H} $STATD_HA_CALLOUT ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME ${STATD_PORT:+-p} $STATD_PORT ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT"
+service_debug_cmd="program_stack_traces rpc.statd 5"
diff --git a/ctdb/config/nfs-checks.d/20.nfs.check b/ctdb/config/nfs-checks.d/20.nfs.check
new file mode 100644 (file)
index 0000000..7229f7d
--- /dev/null
@@ -0,0 +1,7 @@
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="startstop_nfs restart-stop"
+service_start_cmd="startstop_nfs start"
+service_debug_cmd="program_stack_traces nfsd 5"
diff --git a/ctdb/config/nfs-checks.d/30.nlockmgr.check b/ctdb/config/nfs-checks.d/30.nlockmgr.check
new file mode 100644 (file)
index 0000000..c2e723e
--- /dev/null
@@ -0,0 +1,6 @@
+# nlockmgr
+version="4"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="startstop_nfslock stop"
+service_start_cmd="startstop_nfslock start"
diff --git a/ctdb/config/nfs-checks.d/40.mountd.check b/ctdb/config/nfs-checks.d/40.mountd.check
new file mode 100644 (file)
index 0000000..56b3fd2
--- /dev/null
@@ -0,0 +1,7 @@
+# mountd
+version="1"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.mountd"
+service_start_cmd="rpc.mountd $RPCMOUNTDOPTS ${MOUNTD_PORT:+-p} $MOUNTD_PORT"
+service_debug_cmd="program_stack_traces rpc.mountd 5"
diff --git a/ctdb/config/nfs-checks.d/50.rquotad.check b/ctdb/config/nfs-checks.d/50.rquotad.check
new file mode 100644 (file)
index 0000000..b7bd9d2
--- /dev/null
@@ -0,0 +1,7 @@
+# rquotad
+version="1"
+restart_every=2
+unhealthy_after=6
+service_stop_cmd="killall -q -9 rpc.rquotad"
+service_start_cmd="rpc.rquotad ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT"
+service_debug_cmd="program_stack_traces rpc.rquotad 5"
diff --git a/ctdb/config/nfs-checks.d/README b/ctdb/config/nfs-checks.d/README
new file mode 100644 (file)
index 0000000..51ba54b
--- /dev/null
@@ -0,0 +1,28 @@
+NFS check configuration files.
+
+Files are named NN.RPCSERVICE.check.  Files without a .check suffix
+are ignored.
+
+Supported variables are:
+
+* family             - "tcp" or "udp" or space separated list
+                       default: tcp
+* version            - optional, RPC service version number
+                       default: unset, meaning check for any version
+* unhealthy_after    - number of check fails before unhealthy
+                       default: 1
+* restart_every      - number of check fails before restart
+                       default: 0, meaning no restart
+* service_stop_cmd   - command to stop service
+                       default: no default, must be provided if
+                                restart_every > 0
+* service_start_cmd  - command to start service
+                       default: no default, must be provided if
+                                restart_every > 0
+* service_debug_cmd  - command to debug a service after trying to stop it;
+                       for example, it can be useful to print stack
+                       traces of threads that have not exited, since
+                       they may be stuck doing I/O;
+                       no default, see also function program_stack_traces()
+
+Quoting inside values is not preserved.
index ce7d8a629e952aa7905dfeb2bb86b054cc4c61db..503670023b8cc16884dbb4ad046339e375e3ba57 100644 (file)
@@ -125,6 +125,8 @@ install -m755 config/ctdb.init $RPM_BUILD_ROOT%{initdir}/ctdb
 # This is a hack. All documents should be installed in /usr/share/doc.
 rm -f $RPM_BUILD_ROOT%{_sysconfdir}/ctdb/events.d/README
 cp config/events.d/README README.eventscripts
+rm -f $RPM_BUILD_ROOT%{_sysconfdir}/ctdb/nfs-checks.d/README
+cp config/nfs-checks.d/README README.nfs-checks.d
 cp config/notify.d.README README.notify.d
 
 # Remove "*.old" files
@@ -183,11 +185,11 @@ rm -rf $RPM_BUILD_ROOT
 %{_sysconfdir}/ctdb/events.d/70.iscsi
 %{_sysconfdir}/ctdb/events.d/91.lvs
 %{_sysconfdir}/ctdb/events.d/99.timeout
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/10.statd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/20.nfsd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/30.lockd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/40.mountd.check
-%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/50.rquotad.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/10.status.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/20.nfs.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/30.nlockmgr.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/40.mountd.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-checks.d/50.rquotad.check
 %{_sysconfdir}/ctdb/statd-callout
 %{_sbindir}/ctdbd
 %{_sbindir}/ctdbd_wrapper
diff --git a/ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d b/ctdb/tests/eventscripts/etc-ctdb/nfs-checks.d
new file mode 120000 (symlink)
index 0000000..3dc2161
--- /dev/null
@@ -0,0 +1 @@
+../../../config/nfs-checks.d
\ No newline at end of file
index 69b0d477570c11ac54aff175b638a1d1c66bd596..b13399e97c3e3e55febfde127f5356662cabd205 100644 (file)
@@ -893,22 +893,23 @@ EOF
     done
 }
 
-mark_background ()
-{
-    sed -e 's@^@\&@'
-}
-
-convert_progname ()
+guess_output ()
 {
     case "$1" in
-       nfs)      echo "nfsd" ;;
-       nlockmgr) echo "lockd" ;;
-       status)   echo "statd" ;;
-       *)        echo "$1" ;;
+       startstop_nfslock\ start)
+           echo "&Starting nfslock: OK"
+           ;;
+       startstop_nfs\ start)
+           cat <<EOF
+&Starting nfslock: OK
+&Starting nfs: OK
+EOF
+           ;;
+       *)
+           : # Nothing
     esac
 }
 
-
 # Set the required result for a particular RPC program having failed
 # for a certain number of iterations.  This is probably still a work
 # in progress.  Note that we could hook aggressively
@@ -920,12 +921,13 @@ convert_progname ()
 rpc_set_service_failure_response ()
 {
     _rpc_service="$1"
-    # The number of failures defaults to the iteration number.  This
-    # will be true when we fail from the 1st iteration... but we need
-    # the flexibility to set the number of failures.
-    _numfails="${2:-${iteration:-1}}"
+    _numfails="${2:-1}" # default 1
 
-    _progname=$(convert_progname "$_rpc_service")
+    # Default
+    ok_null
+    if [ $_numfails -eq 0 ] ; then
+       return
+    fi
 
     nfs_load_config
 
@@ -933,79 +935,69 @@ rpc_set_service_failure_response ()
     _nl="
 "
 
-    # Default
-    ok_null
+    _dir="${CTDB_NFS_CHECKS_DIR:-${CTDB_BASE}/nfs-checks.d}"
 
-    _file=$(ls "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9]."${_progname}.check")
+    _file=$(ls "$_dir"/[0-9][0-9]."${_rpc_service}.check")
     [ -r "$_file" ] || die "RPC check file \"$_file\" does not exist or is not unique"
 
-    while read _op _li _actions ; do
-       # Skip comments
-       case "$_op" in
-           \#*) continue ;;
-       esac
+    _out=$(mktemp --tmpdir="$EVENTSCRIPTS_TESTS_VAR_DIR")
+    _rc_file=$(mktemp --tmpdir="$EVENTSCRIPTS_TESTS_VAR_DIR")
 
-       _hit=false
-       if [ "$_op" != "%" ] ; then
-           if [ $_numfails $_op $_li ] ; then
-               _hit=true
-           fi
+    (
+       # Subshell to restrict the scope of variables...
+
+       # Defaults
+       family="tcp"
+       version=""
+       unhealthy_after=1
+       restart_every=0
+       service_stop_cmd=""
+       service_start_cmd=""
+       service_debug_cmd=""
+
+       # Don't bother syntax checking, eventscript does that...
+       . "$_file"
+
+       # Just use the first version, default to 1.  This is dumb but
+       # handles all the cases that we care about now...
+       if [ -n "$version" ] ; then
+           _ver="${version%% *}"
        else
-           if [ $_numfails -gt 0 -a $(($_numfails $_op $_li)) -eq 0 ] ; then
-               _hit=true
-           fi
+           _ver=1
        fi
-       if $_hit ; then
-           _out=""
-           _rc=0
-           for _action in $_actions ; do
-               case "$_action" in
-                   verbose)
-                       _ver=1
-                       case "$_rpc_service" in
-                           nfs)      _ver=3 ;;
-                           nlockmgr) _ver=4 ;;
-                       esac
-                       _out="\
-ERROR: $_rpc_service failed RPC check:
+       _rpc_check_out="\
+$_rpc_service failed RPC check:
 rpcinfo: RPC: Program not registered
 program $_rpc_service version $_ver is not available"
-                       ;;
-                   restart*)
-                       _p="rpc.${_progname}"
-                       case "$_action" in
-                           *:b) _bg=mark_background ;;
-                           *)   _bg=cat  ;;
-                       esac
-                       case "$_progname" in
-                           nfsd)
-                               _t=$(program_stack_traces "nfsd" 5)
-                               _t="${_t}${_t:+${_nl}}Starting nfslock: OK
-Starting nfs: OK"
-                               _t=$(echo "$_t" | $_bg)
-                               _t="\
-Trying to restart NFS service
-${_t}"
-                               ;;
-                           lockd)
-                               _t=$(echo "Starting nfslock: OK" | $_bg)
-                               _t="Trying to restart lock manager service${_t:+${_nl}}${_t}"
-                               ;;
-                           *)
-                               _t="Trying to restart $_progname [${_p}]"
-                               _stacks=$(program_stack_traces "$_p" 5)
-                               _t="${_t}${_stacks:+${_nl}}${_stacks}"
-                       esac
-                       _out="${_out}${_out:+${_nl}}${_t}"
-                       ;;
-                   unhealthy)
-                       _rc=1
-               esac
-           done
-           required_result $_rc "$_out"
-           return
+
+       if [ $unhealthy_after -gt 0 -a $_numfails -ge $unhealthy_after ] ; then
+           _unhealthy=true
+           echo 1 >"$_rc_file"
+           echo "ERROR: ${_rpc_check_out}" >>"$_out"
+       else
+           _unhealthy=false
+           echo 0 >"$_rc_file"
        fi
-    done <"$_file"
+
+       if [ $restart_every -gt 0 -a $(($_numfails % $restart_every)) -eq 0 ] ; then
+           if ! $_unhealthy ; then
+               echo "WARNING: ${_rpc_check_out}" >>"$_out"
+           fi
+
+           echo "Trying to restart service \"${_rpc_service}\"..." >>"$_out"
+
+           if [ -n "$service_debug_cmd" ] ; then
+               $service_debug_cmd 2>&1 >>"$_out"
+           fi
+
+           guess_output "$service_start_cmd" >>"$_out"
+       fi
+    )
+
+    read _rc <"$_rc_file"
+    required_result $_rc <"$_out"
+
+    rm -f "$_out" "$_rc_file"
 }
 
 ######################################################################
index add10ec0e78597a375ef288ac4647fe64e96947b..7b3304b10a8e7c599796d3f03fca0a2368d94cfa 100755 (executable)
@@ -464,7 +464,7 @@ def build(bld):
 
     etc_subdirs = [
         'events.d',
-        'nfs-rpc-checks.d'
+        'nfs-checks.d'
     ]
 
     if bld.env.standalone_ctdb:
@@ -627,7 +627,7 @@ def build(bld):
     test_eventscript_links = [
         'events.d',
         'functions',
-        'nfs-rpc-checks.d',
+        'nfs-checks.d',
         'statd-callout'
     ]