add a new support function ctdb_check_counter_equal()
author Ronnie Sahlberg <ronniesahlberg@gmail.com>
Wed, 17 Nov 2010 02:50:56 +0000 (13:50 +1100)
committer Ronnie Sahlberg <ronniesahlberg@gmail.com>
Wed, 17 Nov 2010 02:50:56 +0000 (13:50 +1100)
update nfs to try to restart the service after 10 consecutive failures
and to flag the node unhealthy after 15

add similar function to mountd

config/events.d/60.nfs
config/functions

index 038adbb09409b34010f80219b1a211ec5b92e37e..8889cadcc26d1984d646d3072c871310f06bf778 100755 (executable)
@@ -78,15 +78,20 @@ case "$1" in
                (
                        service_name="nfs_knfsd"
                        ctdb_counter_incr
-                       ctdb_check_counter_limit 10 quiet >/dev/null
+
+                       ctdb_check_counter_equal 10 || {
+                               echo "Trying to restart NFS service"
+                               startstop_nfs restart >/dev/null 2>&1 &
+                               exit 0
+                       }
+
+                       ctdb_check_counter_limit 15 quiet >/dev/null
                ) || {
                        echo "$ctdb_check_rpc_out"
                        echo "Trying to restart NFS service"
                        startstop_nfs restart
                        exit 1
                }
-               # we haven't hit the failure limit so restart quietly
-               startstop_nfs restart >/dev/null 2>&1 &
            fi
        }
 
@@ -107,7 +112,25 @@ case "$1" in
 
        # mount needs special handling since it is sometimes not started
        # correctly on RHEL5
-       ctdb_check_rpc "MOUNTD" 100005 1 || {
+       if ctdb_check_rpc "MOUNTD" 100005 1 >/dev/null ; then
+               (service_name="nfs_mountd"; ctdb_counter_init)
+       else
+       (
+               service_name="nfs_mountd"
+               ctdb_counter_incr
+
+               ctdb_check_counter_equal 5 || {
+                       p="rpc.mountd"
+                       cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+                       echo "Trying to restart MOUNTD [${cmd}]"
+                       killall -q -9 $p
+                       $cmd &
+                       exit 0
+               }
+
+               ctdb_check_counter_limit 10 quiet >/dev/null
+       ) || {
+               echo "$ctdb_check_rpc_out"
                p="rpc.mountd"
                cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
                echo "Trying to restart MOUNTD [${cmd}]"
@@ -115,6 +138,9 @@ case "$1" in
                $cmd &
                exit 1
        }
+       fi
+
+
        # rquotad needs special handling since it is sometimes not started
        # correctly on RHEL5
        # this is not a critical service so we dont flag the node as unhealthy
index 610085b67700feb1faad5c715df79cb506ff4cde..4acfc4ffab3f4347dce18f23be6a2e1f0e8c4be9 100755 (executable)
@@ -571,6 +571,19 @@ ctdb_check_counter_limit () {
        echo "WARNING: less than $_limit consecutive failures ($_size) for $service_name, not unhealthy yet"
     fi
 }
+ctdb_check_counter_equal () { # $1 = count to test for; exit status is inverted: 1 when the counter equals $1, else 0
+    _ctdb_counter_common # presumably sets $_counter_file for the current $service_name -- TODO confirm
+    # Count that the current failure counter is compared against.
+    _limit=$1
+    # The counter value is the counter file's size in bytes (read via stat);
+    # unary counting! If stat fails (e.g. no counter file) the value is 0.
+    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
+    if [ $_size -eq $_limit ] ; then
+       return 1 # counter is exactly $1: fail, so "ctdb_check_counter_equal N || { ... }" runs its block
+    fi
+    return 0 # any other value: succeed, so a caller's "|| { ... }" block is skipped
+}
+
 ########################################################
 
 ctdb_spool_dir="/var/spool/ctdb"