eventscripts: Add optional counter name argument to some counter functions
[ctdb.git] / config / functions
index 35b7db2e61a8eda41c08f6929aaa25db461ac29c..2acac25510fba976734fd4e83bdc5c246ce1b99f 100755 (executable)
@@ -4,7 +4,7 @@ PATH=/bin:/usr/bin:/usr/sbin:/sbin:$PATH
 
 #######################################
 # pull in a system config file, if any
-loadconfig() {
+_loadconfig() {
 
     if [ -z "$1" ] ; then
        foo="${service_config:-${service_name}}"
@@ -25,6 +25,30 @@ loadconfig() {
     fi
 }
 
+loadconfig () {
+    _loadconfig "$@"
+}
+
+##############################################################
+# check number of args for different events
+ctdb_check_args ()
+{
+    case "$1" in
+       takeip|releaseip)
+           if [ $# != 4 ]; then
+               echo "ERROR: must supply interface, IP and maskbits"
+               exit 1
+           fi
+           ;;
+       updateip)
+           if [ $# != 5 ]; then
+               echo "ERROR: must supply old interface, new interface, IP and maskbits"
+               exit 1
+           fi
+           ;;
+    esac
+}
+
 ##############################################################
 # determine on what type of system (init style) we are running
 detect_init_style() {
@@ -135,6 +159,24 @@ ctdb_wait_tcp_ports() {
 }
 
 
+######################################################
+# wrapper around /proc/ settings to allow them to be hooked
+# for testing
+# 1st arg is relative path under /proc/, 2nd arg is value to set
+set_proc ()
+{
+    echo "$2" >"/proc/$1"
+}
+
+######################################################
+# wrapper around getting file contents from /proc/ to allow
+# this to be hooked for testing
+# 1st arg is relative path under /proc/
+get_proc ()
+{
+    cat "/proc/$1"
+}
+
 ######################################################
 # check that a rpc server is registered with portmap
 # and responding to requests
@@ -144,10 +186,14 @@ ctdb_check_rpc() {
     progname="$1"
     prognum="$2"
     version="$3"
-    rpcinfo -u localhost $prognum $version > /dev/null || {
-           echo "ERROR: $progname not responding to rpc requests"
-           exit 1
-    }
+
+    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
+    if [ $? -ne 0 ] ; then
+       ctdb_check_rpc_out="ERROR: $progname failed RPC check:
+$ctdb_check_rpc_out"
+       echo "$ctdb_check_rpc_out"
+       return 1
+    fi
 }
 
 ######################################################
@@ -183,16 +229,164 @@ ctdb_check_directories() {
 # check a set of tcp ports
 # usage: ctdb_check_tcp_ports <ports...>
 ######################################################
-ctdb_check_tcp_ports() {
 
-    for p ; do
-       if ! netstat -a -t -n | grep -q "0\.0\.0\.0:$p .*LISTEN" ; then
-            if ! netstat -a -t -n | grep -q ":::$p .*LISTEN" ; then
-               echo "ERROR: $service_name tcp port $p is not responding"
+# This flag file is created when a service is initially started.  It
+# is deleted the first time TCP port checks for that service succeed.
+# Until then ctdb_check_tcp_ports() prints a more subtle "error"
+# message if a port check fails.
+_ctdb_check_tcp_common ()
+{
+    _ctdb_service_started_file="$ctdb_fail_dir/$service_name.started"
+}
+
+ctdb_check_tcp_init ()
+{
+    _ctdb_check_tcp_common
+    mkdir -p "${_ctdb_service_started_file%/*}" # dirname
+    touch "$_ctdb_service_started_file"
+}
+
+ctdb_check_tcp_ports()
+{
+    if [ -z "$1" ] ; then
+       echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
+       exit 1
+    fi
+
+    # Set default value for CTDB_TCP_PORT_CHECKS if unset.
+    # If any of these defaults are unsupported then this variable can
+    # be overridden in /etc/sysconfig/ctdb or via a file in
+    # /etc/ctdb/rc.local.d/.
+    : ${CTDB_TCP_PORT_CHECKERS:=ctdb nmap netstat}
+
+    for _c in $CTDB_TCP_PORT_CHECKERS ; do
+       ctdb_check_tcp_ports_$_c "$@"
+       case "$?" in
+           0)
+               _ctdb_check_tcp_common
+               rm -f "$_ctdb_service_started_file"
+               return 0
+               ;;
+           1)
+               _ctdb_check_tcp_common
+               if [ ! -f "$_ctdb_service_started_file" ] ; then
+                   echo "ERROR: $service_name tcp port $_p is not responding"
+                   cat <<EOF
+$ctdb_check_tcp_ports_debug
+EOF
+               else
+                   echo "INFO: $service_name tcp port $_p is not responding"
+               fi
+
                return 1
-            fi
-       fi
+               ;;
+           127)
+               : # Not implemented
+               ;;
+           *)
+               
+       esac
+    done
+
+    echo "INTERNAL ERROR: ctdb_check_ports - no working checkers in CTDB_TCP_PORT_CHECKERS=\"$CTDB_TCP_PORT_CHECKERS\""
+
+    return 127
+}
+
+ctdb_check_tcp_ports_netstat ()
+{
+    _cmd='netstat -l -t -n'
+    _ns=$($_cmd 2>&1)
+    if [ $? -eq 127 ] ; then
+       # netstat probably not installed - unlikely?
+       ctdb_check_tcp_ports_debug="$_ns"
+       return 127
+    fi
+
+    for _p ; do  # process each function argument (port)
+       for _a in '0\.0\.0\.0' '::' ; do
+           _pat="[[:space:]]${_a}:${_p}[[:space:]]+[^[:space:]]+[[:space:]]+LISTEN"
+           if echo "$_ns" | grep -E -q "$_pat" ; then
+               # We matched the port, so process next port
+               continue 2
+           fi
+       done
+
+       # We didn't match the port, so flag an error.
+       ctdb_check_tcp_ports_debug="$_cmd shows this output:
+$_ns"
+       return 1
     done
+
+    return 0
+}
+
+ctdb_check_tcp_ports_nmap ()
+{
+    # nmap wants a comma-separated list of ports
+    _ports=""
+    for _p ; do
+       _ports="${_ports}${_ports:+,}${_p}"
+    done
+
+    _cmd="nmap -n -oG - -PS 127.0.0.1 -p $_ports"
+
+    _nmap_out=$($_cmd 2>&1)
+    if [ $? -eq 127 ] ; then
+       # nmap probably not installed
+       ctdb_check_tcp_ports_debug="$_nmap_out"
+       return 127
+    fi
+
+    # get the port-related output
+    _port_info=$(echo "$_nmap_out" | sed -n -r -e 's@^.*Ports:[[:space:]]@@p')
+
+    for _p ; do
+       # looking for something like this:
+       #  445/open/tcp//microsoft-ds///
+       # possibly followed by a comma
+       _t="$_p/open/tcp//"
+       case "$_port_info" in
+           # The info we're after must be either at the beginning of
+           # the string or it must follow a space.
+            $_t*|*\ $_t*) : ;;
+           *)
+               # Nope, flag an error...
+               ctdb_check_tcp_ports_debug="$_cmd shows this output:
+$_nmap_out"
+               return 1
+       esac
+    done
+
+    return 0
+}
+
+# Use the new "ctdb checktcpport" command to check the port.
+# This is very cheap.
+ctdb_check_tcp_ports_ctdb ()
+{
+    for _p ; do  # process each function argument (port)
+       _cmd="ctdb checktcpport $_p"
+       _out=$($_cmd 2>&1)
+       _ret=$?
+       case "$_ret" in
+           0)
+               ctdb_check_tcp_ports_debug="\"$_cmd\" was able to bind to port"
+               return 1
+               ;;
+           98)
+               # Couldn't bind, something already listening, next port...
+               continue
+               ;;
+           *)
+               ctdb_check_tcp_ports_debug="$_cmd (exited with $_ret) with output:
+$_out"
+               # assume not implemented
+               return 127
+       esac
+    done
+
+    return 0
 }
 
 ######################################################
@@ -231,7 +425,7 @@ kill_tcp_connections() {
     _failed=0
 
     _killcount=0
-    connfile="$CTDB_BASE/state/connections.$_IP"
+    connfile="$CTDB_VARDIR/state/connections.$_IP"
     netstat -tn |egrep "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' > $connfile
     netstat -tn |egrep "^tcp.*[[:space:]]+::ffff:$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >> $connfile
 
@@ -281,7 +475,7 @@ kill_tcp_connections_local_only() {
     _failed=0
 
     _killcount=0
-    connfile="$CTDB_BASE/state/connections.$_IP"
+    connfile="$CTDB_VARDIR/state/connections.$_IP"
     netstat -tn |egrep "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' > $connfile
     netstat -tn |egrep "^tcp.*[[:space:]]+::ffff:$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >> $connfile
 
@@ -323,7 +517,7 @@ tickle_tcp_connections() {
     _failed=0
 
     _killcount=0
-    connfile="$CTDB_BASE/state/connections.$_IP"
+    connfile="$CTDB_VARDIR/state/connections.$_IP"
     netstat -tn |egrep "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' > $connfile
     netstat -tn |egrep "^tcp.*[[:space:]]+::ffff:$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >> $connfile
 
@@ -367,7 +561,11 @@ startstop_nfs() {
                        service nfsserver stop > /dev/null 2>&1
                        ;;
                restart)
-                       service nfsserver restart
+                       echo 0 >/proc/fs/nfsd/threads
+                       service nfsserver stop > /dev/null 2>&1
+                       pkill -9 nfsd
+                       nfs_dump_some_threads
+                       service nfsserver start
                        ;;
                esac
                ;;
@@ -382,8 +580,13 @@ startstop_nfs() {
                        service nfslock stop > /dev/null 2>&1
                        ;;
                restart)
-                       service nfslock restart
-                       service nfs restart
+                       echo 0 >/proc/fs/nfsd/threads
+                       service nfs stop > /dev/null 2>&1
+                       service nfslock stop > /dev/null 2>&1
+                       pkill -9 nfsd
+                       nfs_dump_some_threads
+                       service nfslock start
+                       service nfs start
                        ;;
                esac
                ;;
@@ -394,6 +597,28 @@ startstop_nfs() {
        esac
 }
 
+# Dump up to the configured number of nfsd thread backtraces.
+nfs_dump_some_threads ()
+{
+    [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || return 0
+
+    # Optimisation to avoid running an unnecessary pidof
+    [ $CTDB_NFS_DUMP_STUCK_THREADS -gt 0 ] || return 0
+
+    _count=0
+    for _pid in $(pidof nfsd) ; do
+       [ $_count -le $CTDB_NFS_DUMP_STUCK_THREADS ] || break
+
+       # Do this first to avoid racing with thread exit
+       _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
+       if [ -n "$_stack" ] ; then
+           echo "Stack trace for stuck nfsd thread [${_pid}]:"
+           echo "$_stack"
+           _count=$(($_count + 1))
+       fi
+    done
+}
+
 ########################################################
 # start/stop the nfs lockmanager service on different platforms
 ########################################################
@@ -417,6 +642,10 @@ startstop_nfslock() {
                stop)
                        service nfsserver stop > /dev/null 2>&1
                        ;;
+               restart)
+                       service nfsserver stop
+                       service nfsserver start
+                       ;;
                esac
                ;;
        rhel)
@@ -427,6 +656,10 @@ startstop_nfslock() {
                stop)
                        service nfslock stop > /dev/null 2>&1
                        ;;
+               restart)
+                       service nfslock stop
+                       service nfslock start
+                       ;;
                esac
                ;;
        *)
@@ -453,7 +686,7 @@ add_ip_to_iface()
        local _iface=$1
        local _ip=$2
        local _maskbits=$3
-       local _state_dir="$CTDB_BASE/state/interface_modify"
+       local _state_dir="$CTDB_VARDIR/state/interface_modify"
        local _lockfile="$_state_dir/$_iface.flock"
        local _readd_base="$_state_dir/$_iface.readd.d"
 
@@ -476,7 +709,7 @@ delete_ip_from_iface()
        local _iface=$1
        local _ip=$2
        local _maskbits=$3
-       local _state_dir="$CTDB_BASE/state/interface_modify"
+       local _state_dir="$CTDB_VARDIR/state/interface_modify"
        local _lockfile="$_state_dir/$_iface.flock"
        local _readd_base="$_state_dir/$_iface.readd.d"
 
@@ -500,7 +733,7 @@ setup_iface_ip_readd_script()
        local _ip=$2
        local _maskbits=$3
        local _readd_script=$4
-       local _state_dir="$CTDB_BASE/state/interface_modify"
+       local _state_dir="$CTDB_VARDIR/state/interface_modify"
        local _lockfile="$_state_dir/$_iface.flock"
        local _readd_base="$_state_dir/$_iface.readd.d"
 
@@ -526,16 +759,17 @@ setup_iface_ip_readd_script()
 # ctdb_check_counter_limit succeeds when count >= <limit>
 ########################################################
 _ctdb_counter_common () {
-    _counter_file="$ctdb_fail_dir/$service_name"
+    _service_name="${1:-${service_name}}"
+    _counter_file="$ctdb_fail_dir/$_service_name"
     mkdir -p "${_counter_file%/*}" # dirname
 }
 ctdb_counter_init () {
-    _ctdb_counter_common
+    _ctdb_counter_common "$1"
 
     >"$_counter_file"
 }
 ctdb_counter_incr () {
-    _ctdb_counter_common
+    _ctdb_counter_common "$1"
 
     # unary counting!
     echo -n 1 >> "$_counter_file"
@@ -549,12 +783,25 @@ ctdb_check_counter_limit () {
     # unary counting!
     _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
     if [ $_size -ge $_limit ] ; then
-       echo "ERROR: more than $_limit consecutive failures for $service_name, marking cluster unhealthy"
+       echo "ERROR: more than $_limit consecutive failures for $_service_name, marking cluster unhealthy"
        exit 1
     elif [ $_size -gt 0 -a -z "$_quiet" ] ; then
-       echo "WARNING: less than $_limit consecutive failures ($_size) for $service_name, not unhealthy yet"
+       echo "WARNING: less than $_limit consecutive failures ($_size) for $_service_name, not unhealthy yet"
+    fi
+}
+ctdb_check_counter_equal () {
+    _ctdb_counter_common
+
+    _limit=$1
+
+    # unary counting!
+    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
+    if [ $_size -eq $_limit ] ; then
+       return 1
     fi
+    return 0
 }
+
 ########################################################
 
 ctdb_spool_dir="/var/spool/ctdb"
@@ -615,6 +862,7 @@ ctdb_service_unset_reconfigure ()
 
 ctdb_service_reconfigure ()
 {
+    echo "Reconfiguring service \"$service_name\"..."
     if [ -n "$service_reconfigure" ] ; then
        eval $service_reconfigure
     else
@@ -633,51 +881,62 @@ ctdb_compat_managed_service ()
 
 is_ctdb_managed_service ()
 {
+    _service_name="${1:-${service_name}}"
+
     t=" $CTDB_MANAGED_SERVICES "
 
     ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
     ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
     ctdb_compat_managed_service "$CTDB_MANAGES_SCP"      "scp"
-    ctdb_compat_managed_service "$CTDB_MANAGES_WINDBIND" "windbind"
+    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
     ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
     ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
     ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
     ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
+    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"
 
-    # Returns 0 if "<space>$service_name<space>" appears in $t
-    [ "${t#* ${service_name} }" != "${t}" ]
+    # Returns 0 if "<space>$_service_name<space>" appears in $t
+    [ "${t#* ${_service_name} }" != "${t}" ]
 }
 
 ctdb_start_stop_service ()
 {
-    _active="$ctdb_active_dir/$service_name"
+    # Do nothing unless configured to...
+    [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] || return 0
+
+    _service_name="${1:-${service_name}}"
 
-    if is_ctdb_managed_service ; then
+    [ "$event_name" = "monitor" ] || return 0
+
+    _active="$ctdb_active_dir/$_service_name"
+    if is_ctdb_managed_service "$_service_name"; then
        if ! [ -e "$_active" ] ; then
-           echo "Starting service $service_name"
+           echo "Starting service $_service_name"
            ctdb_service_start || exit $?
            mkdir -p "$ctdb_active_dir"
            touch "$_active"
            exit 0
        fi
-    elif ! is_ctdb_managed_service ; then
+    else
        if [ -e "$_active" ] ; then
-           echo "Stopping service $service_name"
+           echo "Stopping service $_service_name"
+           CTDB_AUTOSTOPPING="$_service_name"
            ctdb_service_stop || exit $?
            rm -f "$_active"
+           exit 0
        fi
-       exit 0
     fi
 }
 
 ctdb_service_start ()
 {
     if [ -n "$service_start" ] ; then
-       eval $service_start
+       eval $service_start || return $?
     else
-       service "$service_name" start
+       service "$service_name" start || return $?
     fi
     ctdb_counter_init
+    ctdb_check_tcp_init
 }
 
 ctdb_service_stop ()
@@ -704,74 +963,65 @@ ctdb_standard_event_handler ()
     esac
 }
 
-ipv4_host_addr_to_net_addr()
+# iptables doesn't like being re-entered, so flock-wrap it.
+iptables()
 {
-       local HOST=$1
-       local MASKBITS=$2
-
-       local HOST0=$(echo $HOST | awk -F . '{print $4}')
-       local HOST1=$(echo $HOST | awk -F . '{print $3}')
-       local HOST2=$(echo $HOST | awk -F . '{print $2}')
-       local HOST3=$(echo $HOST | awk -F . '{print $1}')
-
-       local HOST_NUM=$(( $HOST0 + $HOST1 * 256 + $HOST2 * (256 ** 2) + $HOST3 * (256 ** 3) ))
-
-       local MASK_NUM=$(( ( (2**32 - 1) * (2**(32 - $MASKBITS)) ) & (2**32 - 1) ))
-
-       local NET_NUM=$(( $HOST_NUM & $MASK_NUM))
-
-       local NET0=$(( $NET_NUM & 255 ))
-       local NET1=$(( ($NET_NUM & (255 * 256)) / 256 ))
-       local NET2=$(( ($NET_NUM & (255 * 256**2)) / 256**2 ))
-       local NET3=$(( ($NET_NUM & (255 * 256**3)) / 256**3 ))
-
-       echo "$NET3.$NET2.$NET1.$NET0"
+       flock -w 30 /var/ctdb/iptables-ctdb.flock /sbin/iptables "$@"
 }
 
-ipv4_maskbits_to_net_mask()
-{
-       local MASKBITS=$1
-
-       local MASK_NUM=$(( ( (2**32 - 1) * (2**(32 - $MASKBITS)) ) & (2**32 - 1) ))
-
-       local MASK0=$(( $MASK_NUM & 255 ))
-       local MASK1=$(( ($MASK_NUM & (255 * 256)) / 256 ))
-       local MASK2=$(( ($MASK_NUM & (255 * 256**2)) / 256**2 ))
-       local MASK3=$(( ($MASK_NUM & (255 * 256**3)) / 256**3 ))
+########################################################
+# tickle handling
+########################################################
 
-       echo "$MASK3.$MASK2.$MASK1.$MASK0"
-}
+# Temporary directory for tickles.
+tickledir="$CTDB_VARDIR/state/tickles"
+mkdir -p "$tickledir"
 
-ipv4_is_valid_addr()
+update_tickles ()
 {
-       local ADDR=$1
-       local fail=0
-
-       local N=`echo $ADDR | sed -e 's/[0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*//'`
-       test -n "$N" && fail=1
-
-       local ADDR0=$(echo $ADDR | awk -F . '{print $4}')
-       local ADDR1=$(echo $ADDR | awk -F . '{print $3}')
-       local ADDR2=$(echo $ADDR | awk -F . '{print $2}')
-       local ADDR3=$(echo $ADDR | awk -F . '{print $1}')
-
-       test "$ADDR0" -gt 255 && fail=1
-       test "$ADDR1" -gt 255 && fail=1
-       test "$ADDR2" -gt 255 && fail=1
-       test "$ADDR3" -gt 255 && fail=1
-
-       test x"$fail" != x"0" && {
-               #echo "IPv4: '$ADDR' is not a valid address"
-               return 1;
-       }
+       _port="$1"
+
+       mkdir -p "$tickledir" # Just in case
+
+       # Who am I?
+       _pnn=$(ctdb pnn) ; _pnn=${_pnn#PNN:}
+
+       # What public IPs do I hold?
+       _ips=$(ctdb -Y ip | awk -F: -v pnn=$_pnn '$3 == pnn {print $2}')
+
+       # IPs as a regexp choice
+       _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"
+
+       # Record connections to our public IPs in a temporary file
+       _my_connections="${tickledir}/${_port}.connections"
+       rm -f "$_my_connections"
+       netstat -tn |
+       awk -v destpat="^${_ipschoice}:${_port}\$" \
+         '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
+       sort >"$_my_connections"
+
+       # Record our current tickles in a temporary file
+       _my_tickles="${tickledir}/${_port}.tickles"
+       rm -f "$_my_tickles"
+       for _i in $_ips ; do
+               ctdb -Y gettickles $_i $_port | 
+               awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
+       done |
+       sort >"$_my_tickles"
+
+       # Add tickles for connections that we haven't already got tickles for
+       comm -23 "$_my_connections" "$_my_tickles" |
+       while read _src _dst ; do
+               ctdb addtickle $_src $_dst
+       done
 
-       return 0;
-}
+       # Remove tickles for connections that are no longer there
+       comm -13 "$_my_connections" "$_my_tickles" |
+       while read _src _dst ; do
+               ctdb deltickle $_src $_dst
+       done
 
-# iptables doesn't like being re-entered, so flock-wrap it.
-iptables()
-{
-       flock -w 30 /var/ctdb/iptables-ctdb.flock /sbin/iptables "$@"
+       rm -f "$_my_connections" "$_my_tickles" 
 }
 
 ########################################################
@@ -791,3 +1041,4 @@ iptables()
 script_name="${0##*/}"       # basename
 service_name="$script_name"  # default is just the script name
 service_fail_limit=1
+event_name="$1"