eventscripts: New configuration variable $CTDB_NFS_DUMP_STUCK_THREADS

author Martin Schwenke <martin@meltin.net>

Thu, 13 Jun 2013 01:56:25 +0000 (11:56 +1000)

committer Amitay Isaacs <amitay@gmail.com>

Thu, 20 Jun 2013 02:56:29 +0000 (12:56 +1000)
author Martin Schwenke <martin@meltin.net>
Thu, 13 Jun 2013 01:56:25 +0000 (11:56 +1000)
committer Amitay Isaacs <amitay@gmail.com>
Thu, 20 Jun 2013 02:56:29 +0000 (12:56 +1000)
diff --git a/config/ctdb.sysconfig b/config/ctdb.sysconfig

index 5ba1f7205d737c49962dd909a5dfb741f5d366a2..c0d6f0a19566f760b71192b9688fe756f1e962f3 100644 (file)
--- a/config/ctdb.sysconfig
+++ b/config/ctdb.sysconfig
@@ -142,6 +142,12 @@ CTDB_RECOVERY_LOCK="/some/place/on/shared/storage"
  # to not do this check.
  # CTDB_MONITOR_NFS_THREAD_COUNT="yes"
  
+
+# The number of nfsd threads to dump stack traces for if some are
+# still alive after stopping NFS during a restart.  The default is to
+# dump no stack traces.
+# CTDB_NFS_DUMP_STUCK_THREADS=5
+
  # When set to yes, the CTDB node will start in DISABLED mode and not host
  # any public ip addresses. The administrator needs to explicitely enable
  # the node with "ctdb enable"
diff --git a/config/functions b/config/functions

index 4d828015a9d0b3f937b6d50b6f5b0e620fe49d75..b35f60fdb8fc5fa8d3b127ca9bdfc76961dc2260 100755 (executable)
--- a/config/functions
+++ b/config/functions
@@ -564,6 +564,7 @@ startstop_nfs() {
                         echo 0 >/proc/fs/nfsd/threads
                         service nfsserver stop > /dev/null 2>&1
                         pkill -9 nfsd
+                       nfs_dump_some_threads
                         service nfsserver start
                         ;;
                 esac
@@ -583,6 +584,7 @@ startstop_nfs() {
                         service nfs stop > /dev/null 2>&1
                         service nfslock stop > /dev/null 2>&1
                         pkill -9 nfsd
+                       nfs_dump_some_threads
                         service nfslock start
                         service nfs start
                         ;;
@@ -595,6 +597,28 @@ startstop_nfs() {
         esac
  }
  
+# Dump stack traces for at most $CTDB_NFS_DUMP_STUCK_THREADS nfsd threads.
+nfs_dump_some_threads ()
+{
+    [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || return 0
+
+    # Optimisation to avoid running an unnecessary pidof
+    [ "$CTDB_NFS_DUMP_STUCK_THREADS" -gt 0 ] || return 0
+
+    _count=0
+    for _pid in $(pidof nfsd) ; do
+       # _count starts at 0, so -lt (not -le) caps output at the limit
+       [ $_count -lt "$CTDB_NFS_DUMP_STUCK_THREADS" ] || break
+       # Read the stack first to avoid racing with thread exit
+       _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
+       if [ -n "$_stack" ] ; then
+           echo "Stack trace for stuck nfsd thread [${_pid}]:"
+           echo "$_stack"
+           _count=$(($_count + 1))
+       fi
+    done
+}
+
  ########################################################
  # start/stop the nfs lockmanager service on different platforms
  ########################################################
author	Martin Schwenke <martin@meltin.net>
	Thu, 13 Jun 2013 01:56:25 +0000 (11:56 +1000)
committer	Amitay Isaacs <amitay@gmail.com>
	Thu, 20 Jun 2013 02:56:29 +0000 (12:56 +1000)
config/ctdb.sysconfig		patch \| blob \| history
config/functions		patch \| blob \| history