ctdb-scripts: Add new NFS service checking infrastructure
[samba.git] / ctdb / config / functions
1 # Hey Emacs, this is a -*- shell-script -*- !!!
2
3 # utility functions for ctdb event scripts
4
# Default locations, applied only when the environment has not already
# provided a non-empty value.
if [ -z "$CTDB_VARDIR" ] ; then
    # Prefer /var/lib/ctdb when that directory exists
    CTDB_VARDIR="/var/ctdb"
    if [ -d "/var/lib/ctdb" ] ; then
        CTDB_VARDIR="/var/lib/ctdb"
    fi
    export CTDB_VARDIR
fi

if [ -z "$CTDB_ETCDIR" ] ; then
    CTDB_ETCDIR="/etc"
    export CTDB_ETCDIR
fi
15
#######################################
# pull in a system config file, if any
#
# $1 - name of the configuration to load (optional).  When it is empty,
#      $service_config or, failing that, $service_name is loaded via
#      loadconfig() instead.
#
# Unless $1 is "ctdb", the "ctdb" configuration is loaded first.  The
# named file is then sourced from the first of these that exists:
#   $CTDB_ETCDIR/sysconfig/$1
#   $CTDB_ETCDIR/default/$1
#   $CTDB_BASE/sysconfig/$1
# For "ctdb", $CTDB_BASE/ctdbd.conf is sourced afterwards if readable.
#
# All paths are quoted so directories containing whitespace work.
_loadconfig() {

    if [ -z "$1" ] ; then
        foo="${service_config:-${service_name}}"
        if [ -n "$foo" ] ; then
            loadconfig "$foo"
            return
        fi
    fi

    if [ "$1" != "ctdb" ] ; then
        loadconfig "ctdb"
    fi

    # No name given and no service name to fall back on: only the
    # "ctdb" configuration (loaded above) applies.
    if [ -z "$1" ] ; then
        return
    fi

    if [ -f "$CTDB_ETCDIR/sysconfig/$1" ]; then
        . "$CTDB_ETCDIR/sysconfig/$1"
    elif [ -f "$CTDB_ETCDIR/default/$1" ]; then
        . "$CTDB_ETCDIR/default/$1"
    elif [ -f "$CTDB_BASE/sysconfig/$1" ]; then
        . "$CTDB_BASE/sysconfig/$1"
    fi

    if [ "$1" = "ctdb" ] ; then
        _config="${CTDB_BASE}/ctdbd.conf"
        if [ -r "$_config" ] ; then
            . "$_config"
        fi
    fi
}
51
# Load configuration; all arguments are handed to _loadconfig() unchanged.
loadconfig ()
{
    _loadconfig "$@"
}
55
56 ##############################################################
57
# Emit debugging output when CTDB_SCRIPT_DEBUGLEVEL (default 2) is at
# least 4.  The level can be overridden from a configuration file.
#
# With a non-empty first argument, all arguments are printed on one
# line prefixed by "DEBUG: ".  Otherwise stdin is copied with every
# line prefixed, which allows lots of debug to be passed using a here
# document.  When debugging is off, such stdin is read and discarded.
debug ()
{
    if ! [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] ; then
        # Not debugging: still drain stdin if no message was given
        [ -n "$1" ] || cat >/dev/null
        return
    fi

    case "$1" in
        "") sed -e 's@^@DEBUG: @' ;;
        *)  echo "DEBUG: $*" ;;
    esac
}
77
# Print a message on stdout and terminate the script.
# $1 - message
# $2 - exit status, default 1
die ()
{
    echo "$1"
    exit ${2:-1}
}
86
# Log a message to either a CTDB log file or syslog, as selected by
# $CTDB_LOGGING.
#
# $1     - tag, only used (as "ctdbd: TAG") when logging via logger
# $2...  - message; when empty the message is taken from stdin
#
# An empty $CTDB_LOGGING means the file /var/log/log.ctdb, "file:PATH"
# means PATH, and anything else (including all syslog:* variants) is
# passed to logger.
script_log ()
{
    _tag="$1" ; shift

    case "$CTDB_LOGGING" in
        "")
            _file="/var/log/log.ctdb"
            ;;
        file:*)
            _file="${CTDB_LOGGING#file:}"
            ;;
        *)
            # There's no tool to do the lossy syslog variants, so just
            # use logger.  $* is left unquoted, presumably so that no
            # message argument at all is passed when none was given.
            logger -t "ctdbd: ${_tag}" $*
            return
            ;;
    esac

    # File logging: append the arguments or, without any, stdin
    if [ -n "$*" ] ; then
        echo "$*"
    else
        cat
    fi >>"$_file"
}
115
# Run a command in the background without losing its output.
#
# The command given as arguments is started with stdin from /dev/null;
# its stdout and stderr are piped to script_log() using the tag
# "${script_name}&".  Always returns 0 straight away.
background_with_logging ()
{
    {
        "$@" 2>&1 </dev/null | script_log "${script_name}&"
    } &

    return 0
}
127
##############################################################
# check number of args for different events
#
# Called with the event name followed by the event's arguments.
# "takeip" and "releaseip" need 3 further arguments, "updateip" needs
# 4.  On a mismatch an error is printed and the script exits with
# status 1.  Other events are not checked.
ctdb_check_args ()
{
    case "$1" in
        takeip|releaseip)
            [ $# -eq 4 ] && return 0
            echo "ERROR: must supply interface, IP and maskbits"
            ;;
        updateip)
            [ $# -eq 5 ] && return 0
            echo "ERROR: must supply old interface, new interface, IP and maskbits"
            ;;
        *)
            return 0
            ;;
    esac

    exit 1
}
147
##############################################################
# determine on what type of system (init style) we are running
#
# Sets $CTDB_INIT_STYLE to "suse", "debian" or "redhat" depending on
# which helper programs are installed.  A value that is already set is
# left untouched.  Always returns 0, including when the style was
# preset.
detect_init_style()
{
    # only do detection if not already set:
    if [ -n "$CTDB_INIT_STYLE" ] ; then
        return 0
    fi

    if [ -x /sbin/startproc ]; then
        CTDB_INIT_STYLE="suse"
    elif [ -x /sbin/start-stop-daemon ]; then
        CTDB_INIT_STYLE="debian"
    else
        CTDB_INIT_STYLE="redhat"
    fi
}
163
######################################################
# simulate /sbin/service on platforms that don't have it
# _service() makes it easier to hook the service() function for
# testing.
#
# $1 - service name; nothing is done (status 0) when it is empty
# $2 - operation, passed to the service program or init script
#
# $_nice must be set by the caller: it is either empty or a command
# prefix such as "nice", so it is deliberately expanded unquoted.
# The first of /sbin/service, /usr/sbin/service,
# $CTDB_ETCDIR/init.d/NAME and $CTDB_ETCDIR/rc.d/init.d/NAME that is
# executable is used; if none is, nothing is run.
_service ()
{
    _service_name="$1"
    _op="$2"

    # do nothing, when no service was specified
    if [ -z "$_service_name" ] ; then
        return 0
    fi

    if [ -x /sbin/service ]; then
        $_nice /sbin/service "$_service_name" "$_op"
    elif [ -x /usr/sbin/service ]; then
        $_nice /usr/sbin/service "$_service_name" "$_op"
    elif [ -x "$CTDB_ETCDIR/init.d/$_service_name" ]; then
        $_nice "$CTDB_ETCDIR/init.d/$_service_name" "$_op"
    elif [ -x "$CTDB_ETCDIR/rc.d/init.d/$_service_name" ]; then
        $_nice "$CTDB_ETCDIR/rc.d/init.d/$_service_name" "$_op"
    fi
}
186
# Run a service operation (arguments: NAME OP) at normal priority.
service()
{
    _nice=''
    _service "$@"
}
192
######################################################
# simulate /sbin/service (niced) on platforms that don't have it
#
# Same as service(), except that the command is run under "nice".
nice_service()
{
    _nice='nice'
    _service "$@"
}
200
######################################################
# Cached retrieval of PNN from local node.  This never changes so why
# open a client connection to the server each time this is needed?
# This sets $pnn - this avoids an unnecessary subprocess.
#
# The value is cached in $CTDB_VARDIR/state/my-pnn.  A failed or empty
# lookup is not cached (and an empty cache file is ignored), so a
# later call can retry.  In that case $pnn is set empty and the
# function returns non-zero.
ctdb_get_pnn ()
{
    _pnn_file="$CTDB_VARDIR/state/my-pnn"
    if [ ! -s "$_pnn_file" ] ; then
        # "ctdb pnn" output is reduced to what follows the last ':'
        _pnn_out=$(ctdb pnn | sed -e 's@.*:@@')
        if [ -z "$_pnn_out" ] ; then
            pnn=""
            return 1
        fi
        echo "$_pnn_out" >"$_pnn_file"
    fi

    read pnn <"$_pnn_file"
}
214
######################################################
# wrapper around /proc/ settings to allow them to be hooked
# for testing
#
# $1 - path relative to /proc/
# $2 - value to write there
set_proc ()
{
    echo "$2" >"/proc/${1}"
}
223
# Like set_proc(), but quietly does nothing (status 0) when /proc/$1
# is not writable.
# $1 - path relative to /proc/
# $2 - value to write there
set_proc_maybe ()
{
    [ -w "/proc/$1" ] || return 0

    set_proc "$1" "$2"
}
230
######################################################
# wrapper around getting file contents from /proc/ to allow
# this to be hooked for testing
#
# $1 - path relative to /proc/; its contents are written to stdout
get_proc ()
{
    cat "/proc/${1}"
}
239
######################################################
# Print kernel stack traces for processes with a given name
#
# $1 - program name, looked up with pidof
# $2 - maximum number of traces to print, default 1
#
# Processes whose stack can not be read (or is empty) are skipped and
# do not count towards the maximum.
program_stack_traces ()
{
    _prog="$1"
    _max="${2:-1}"

    _count=1
    for _pid in $(pidof "$_prog") ; do
        if ! [ $_count -le $_max ] ; then
            break
        fi

        # Fetch the stack before printing anything, to avoid racing
        # with process exit
        _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
        [ -n "$_stack" ] || continue

        echo "Stack trace for ${_prog}[${_pid}]:"
        echo "$_stack"
        _count=$(($_count + 1))
    done
}
260
######################################################
# Check that an RPC service is healthy -
# this includes allowing a certain number of failures
# before marking the NFS service unhealthy.
#
# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
#
# Each triple is a set of 3 arguments: a comparison operator, a
# fail count limit and an action string.
#
# For example:
#
#       nfs_check_rpc_service "lockd" \
#           -ge 15 "verbose restart unhealthy" \
#           -eq 10 "restart:b"
#
# says that if lockd is down for 15 iterations then do
# a verbose restart of lockd and mark the node unhealthy.
# Before this, after 10 iterations of failure, the
# service is restarted silently in the background.
# Order is important: the number of failures needs to be
# specified in reverse order because processing stops
# after the first condition that is true.
######################################################
nfs_check_rpc_service ()
{
    _prog_name="$1" ; shift

    # Nothing more to do when the service answers
    _nfs_check_rpc_common "$_prog_name" && return

    # Walk the triples until one of them triggers
    while [ -n "$3" ] ; do
        _nfs_check_rpc_action "$1" "$2" "$3" && break
        shift 3
    done
}
300
# The new way of doing things...
#
# Check every RPC service that has a file in
# ${CTDB_BASE}/nfs-rpc-checks.d/.  File names look like
# NN.PROGNAME.check or NN.PROGNAME@FAMILY.check.  For a service that
# fails its check, each line of the file is read as
# "OPERATOR LIMIT ACTIONS..." and tried in turn until one triggers.
# Lines starting with '#' are comments.
#
# If the directory contains no readable matching file then nothing is
# checked.
nfs_check_rpc_services ()
{
    # Files must end with .check - avoids editor backups, RPM fu, ...
    for _f in "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9].*.check ; do
        # When nothing matches, the shell hands back the pattern
        # itself, which is not a readable file: skip it.
        [ -r "$_f" ] || continue

        _t="${_f%.check}"
        _prog_name="${_t##*/[0-9][0-9].}"

        # If $_prog_name contains '@' then the bit after it is the
        # address family.
        _family="${_prog_name#*@}"
        if [ "$_family" = "$_prog_name" ] ; then
            _family=""
        else
            _prog_name="${_prog_name%@*}"
        fi

        if _nfs_check_rpc_common "$_prog_name" "$_family" ; then
            # This RPC service is up, check next service...
            continue
        fi

        # Check each line in the file in turn until one of the limit
        # checks is hit...
        while read _cmp _lim _rest ; do
            # Skip comments
            case "$_cmp" in
                \#*) continue ;;
            esac

            if _nfs_check_rpc_action "$_cmp" "$_lim" "$_rest" ; then
                # Limit was hit on this line, no further checking...
                break
            fi
        done <"$_f"
    done
}
338
# Check one RPC service and maintain its failure counter.
#
# $1 - program: nfsd, mountd, rquotad, lockd or statd
# $2 - address family, may be empty
#
# Sets $_service_name to nfs_PROG or nfs_PROG_FAMILY.  Returns 0 and
# calls ctdb_counter_init when the service passes ctdb_check_rpc();
# otherwise calls ctdb_counter_incr and returns 1.  An unknown program
# is an internal error and exits with status 1.
_nfs_check_rpc_common ()
{
    _prog_name="$1"
    _family="$2"

    # Map the program to the RPC program name and version to query
    case "$_prog_name" in
        nfsd)    _rpc_prog=nfs      ; _version=3 ;;
        mountd)  _rpc_prog=mountd   ; _version=1 ;;
        rquotad) _rpc_prog=rquotad  ; _version=1 ;;
        lockd)   _rpc_prog=nlockmgr ; _version=4 ;;
        statd)
            # Some platforms don't have separate programs for all
            # services: without rpc.statd there is nothing to check.
            type "rpc.${_prog_name}" >/dev/null 2>&1 || return 0
            _rpc_prog=status
            _version=1
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
            ;;
    esac

    _service_name="nfs_${_prog_name}${_family:+_}${_family}"

    if ! ctdb_check_rpc "$_rpc_prog" "$_version" "$_family" >/dev/null ; then
        ctdb_counter_incr "$_service_name"
        return 1
    fi

    ctdb_counter_init "$_service_name"
    return 0
}
387
# Run the actions for one (operator, limit) pair if it applies.
#
# $1 - comparison operator, $2 - limit: both handed to
#      ctdb_check_counter for the counter $_service_name
# $3 - space separated actions: verbose, restart, restart:b, unhealthy
#
# Returns 1 without doing anything when ctdb_check_counter succeeds.
# Otherwise the actions are run in order and 0 is returned, unless an
# "unhealthy" action exits the script with status 1 first.  Uses
# $_prog_name and $ctdb_check_rpc_out as set by earlier checks.
_nfs_check_rpc_action ()
{
    _cmp="$1"
    _limit="$2"
    _actions="$3"

    ctdb_check_counter "quiet" "$_cmp" "$_limit" "$_service_name" && return 1

    for _action in $_actions ; do
        case "$_action" in
            verbose)
                echo "ERROR: $ctdb_check_rpc_out"
                ;;
            restart)
                _nfs_restart_rpc_service "$_prog_name"
                ;;
            restart:b)
                # Same, but in the background
                _nfs_restart_rpc_service "$_prog_name" true
                ;;
            unhealthy)
                exit 1
                ;;
            *)
                echo "Internal error: unknown action \"$_action\"."
                exit 1
                ;;
        esac
    done

    return 0
}
420
# Restart one of the NFS RPC services.
#
# $1 - program: nfsd, mountd, rquotad, lockd or statd
# $2 - command ("true" or "false", default "false") deciding whether
#      the start is run via background_with_logging
#
# nfsd and lockd are restarted through startstop_nfs/startstop_nfslock.
# The rpc.* daemons are killed, some stack traces are dumped and the
# daemon is started again with options taken from RPCMOUNTDOPTS,
# MOUNTD_PORT, RQUOTAD_PORT, STATD_HOSTNAME, STATD_PORT and
# STATD_OUTGOING_PORT.  An unknown program exits with status 1.
_nfs_restart_rpc_service ()
{
    _prog_name="$1"
    _background="${2:-false}"

    _maybe_background=""
    if $_background ; then
        _maybe_background="background_with_logging"
    fi

    _p="rpc.${_prog_name}"

    # Common preparation for the daemons that are restarted by hand
    case "$_prog_name" in
        mountd|rquotad|statd)
            echo "Trying to restart $_prog_name [${_p}]"
            killall -q -9 "$_p"
            nfs_dump_some_threads "$_p"
            ;;
    esac

    # Option words are only added when the matching variable is set,
    # hence the unquoted expansions below
    case "$_prog_name" in
        nfsd)
            echo "Trying to restart NFS service"
            $_maybe_background startstop_nfs restart
            ;;
        mountd)
            $_maybe_background $_p $RPCMOUNTDOPTS \
                               ${MOUNTD_PORT:+-p} $MOUNTD_PORT
            ;;
        rquotad)
            $_maybe_background $_p ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT
            ;;
        lockd)
            echo "Trying to restart lock manager service"
            $_maybe_background startstop_nfslock restart
            ;;
        statd)
            $_maybe_background $_p \
                ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME \
                ${STATD_PORT:+-p} $STATD_PORT \
                ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
            ;;
    esac
}
470
######################################################
# Check the health of NFS services
#
# Use .check files in given directory.
# $1 - directory; default is $CTDB_NFS_CHECKS_DIR or, failing that,
#      "${CTDB_BASE}/nfs-checks.d"
#
# Each file NN.PROGNAME.check is fed to nfs_check_service PROGNAME on
# stdin.  A directory without readable matching files means there is
# nothing to check.
######################################################
nfs_check_services ()
{
    _dir="${1:-${CTDB_NFS_CHECKS_DIR:-${CTDB_BASE}/nfs-checks.d}}"

    # Files must end with .check - avoids editor backups, RPM fu, ...
    for _f in "$_dir"/[0-9][0-9].*.check ; do
        # When nothing matches, the shell hands back the pattern
        # itself, which is not a readable file: skip it.
        [ -r "$_f" ] || continue

        _t="${_f%.check}"
        _progname="${_t##*/[0-9][0-9].}"

        nfs_check_service "$_progname" <"$_f"
    done
}
489
######################################################
# Check the health of an NFS service
#
# $1 - progname, passed to rpcinfo (looked up in /etc/rpc)
#
# Settings are read from stdin, one VARIABLE=VALUE per line.  Blank
# lines and lines starting with '#' are ignored; any other variable
# name is an error.
#
# Variables are:
#
# * family             - "tcp" or "udp" or space separated list
#                        default: tcp
# * version            - optional, RPC service version number
#                        default is to omit to check for any version
# * unhealthy_after    - number of check fails before unhealthy
#                        default: 1
# * restart_every      - number of check fails before restart
#                        default: 0, meaning no restart
# * service_stop_cmd   - command to stop service
#                        default: no default, must be provided if
#                                 restart_every > 0
# * service_start_cmd  - command to start service
#                        default: no default, must be provided if
#                                 restart_every > 0
# * service_debug_cmd  - command to debug a service after trying to
#                        stop it; for example, it can be useful to
#                        print stack traces of threads that have not
#                        exited, since they may be stuck doing I/O;
#                        no default, see also program_stack_traces()
#
# Quoting in values is not preserved
#
# The calling script exits with status 1 if the service is found to
# be unhealthy or the settings are invalid.
######################################################
nfs_check_service ()
{
    _progname="$1"

    (
        # Everything runs in a subshell so the settings stay local

        # Defaults
        family="tcp"
        version=""
        unhealthy_after=1
        restart_every=0
        service_stop_cmd=""
        service_start_cmd=""
        service_debug_cmd=""

        # Settings are evaluated one line at a time, so variable
        # references in values are expanded and unknown names can be
        # rejected.
        # NOTE(review): this evals file content, so the .check files
        # need to be trusted.
        while read _line ; do
            case "$_line" in
                ""|\#*)
                    # Blank line or comment
                    continue
                    ;;
                family=*|version=*|unhealthy_after=*|restart_every=*|\
                service_stop_cmd=*|service_start_cmd=*|service_debug_cmd=*)
                    eval "$_line"
                    ;;
                *)
                    echo "ERROR: Unknown variable for ${_progname}: ${_line}"
                    exit 1
                    ;;
            esac
        done

        _service_name="nfs_${_progname}"

        if nfs_check_rpcinfo \
               "$_progname" "$version" "$family" >/dev/null ; then
            # Healthy.  The counter is only touched when failures are
            # actually being counted.
            if [ $unhealthy_after -ne 1 -o $restart_every -ne 0 ] ; then
                ctdb_counter_init "$_service_name"
            fi
            exit 0
        fi

        ctdb_counter_incr "$_service_name"
        _failcount=$(ctdb_counter_get "$_service_name")

        # Too many failures in a row?
        _unhealthy=false
        if [ $unhealthy_after -gt 0 ] && \
               [ $_failcount -ge $unhealthy_after ] ; then
            _unhealthy=true
            echo "ERROR: $ctdb_check_rpc_out"
        fi

        # Time for another restart attempt?
        if [ $restart_every -gt 0 ] && \
               [ $(($_failcount % $restart_every)) -eq 0 ] ; then
            $_unhealthy || echo "WARNING: $ctdb_check_rpc_out"
            nfs_restart_service
        fi

        $_unhealthy && exit 1

        return 0
    ) || exit 1
}
594
# Restart the service currently being checked.
#
# Uses: $_progname, $service_stop_cmd, $service_start_cmd and
# $service_debug_cmd.  The stop and start commands are mandatory; the
# script dies if either is empty.  The service is stopped, the
# optional debug command is run and the start command is then run in
# the background with its output logged.
nfs_restart_service ()
{
    if [ -z "$service_stop_cmd" ] || [ -z "$service_start_cmd" ] ; then
        die "ERROR: Can not restart service \"${_progname}\" without corresponding service_start_cmd/service_stop_cmd settings"
    fi

    echo "Trying to restart service \"${_progname}\"..."

    # Using eval means variables can contain semicolon separated commands
    eval "$service_stop_cmd"
    [ -z "$service_debug_cmd" ] || eval "$service_debug_cmd"
    background_with_logging eval "$service_start_cmd"
}
610
######################################################
# Check an RPC service with rpcinfo
#
# $1 - program name, passed to rpcinfo (looked up in /etc/rpc)
# $2 - version, optional: not passed if empty/unset
# $3 - transport, optional: default is "tcp"
#
# The host queried is $CTDB_RPCINFO_LOCALHOST, default 127.0.0.1.
# On failure $ctdb_check_rpc_out is set to a description that includes
# rpcinfo's output, the description is printed and 1 is returned.
# On success $ctdb_check_rpc_out holds rpcinfo's output.
######################################################
ctdb_check_rpc ()
{
    _progname="$1"
    _version="$2"
    _family="${3:-tcp}"

    _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"

    # Unquoted on purpose: an empty version must not become an argument
    ctdb_check_rpc_out=$(rpcinfo -T $_family $_localhost \
                                 $_progname $_version 2>&1) && return 0

    ctdb_check_rpc_out="$_progname failed RPC check:
$ctdb_check_rpc_out"
    echo "$ctdb_check_rpc_out"
    return 1
}
630
# Check an RPC program for every combination of the given versions and
# transports using ctdb_check_rpc().
#
# $1 - program name, passed to rpcinfo (looked up in /etc/rpc)
# $2 - versions, optional, space separated: when empty a single check
#      without a version is done per transport
# $3 - transports, optional, space separated: default is "tcp"
#
# Stops at the first failing check and returns its status.
nfs_check_rpcinfo ()
{
    _progname="$1"
    _versions="$2"
    _families="${3:-tcp}"

    for _family in $_families ; do
        if [ -z "$_versions" ] ; then
            ctdb_check_rpc $_progname "" $_family || return $?
            continue
        fi

        for _version in $_versions ; do
            ctdb_check_rpc $_progname $_version $_family || return $?
        done
    done
}
647
######################################################
# Ensure $service_name is set
#
# Dies with an internal error message if it is empty.
assert_service_name ()
{
    if [ -z "$service_name" ] ; then
        die "INTERNAL ERROR: \$service_name not set"
    fi
}
654
######################################################
# check a set of directories is available
# return 1 on a missing directory
# directories are read from stdin, one per line
#
# Entries containing '%' are not checked.  When a directory is
# missing, its name is left in $d.
######################################################
ctdb_check_directories_probe()
{
    while IFS="" read d ; do
        case "$d" in
            *%*)
                # Skipped
                ;;
            *)
                [ -d "${d}/." ] || return 1
                ;;
        esac
    done
}
672
######################################################
# check a set of directories is available
# directories are read from stdin
#
# If ctdb_check_directories_probe() reports a missing directory, an
# error naming $service_name and the directory is printed and the
# script exits with status 1.
######################################################
ctdb_check_directories()
{
    if ! ctdb_check_directories_probe ; then
        echo "ERROR: $service_name directory \"$d\" not available"
        exit 1
    fi
}
684
685 ######################################################
686 # check a set of tcp ports
687 # usage: ctdb_check_tcp_ports <ports...>
688 ######################################################
689
# Work out the name of the "started" flag file for $service_name and
# store it in $_ctdb_service_started_file.
#
# This flag file is created when a service is initially started.  It
# is deleted the first time TCP port checks for that service succeed.
# Until then ctdb_check_tcp_ports() prints a more subtle "error"
# message if a port check fails.
_ctdb_check_tcp_common ()
{
    assert_service_name
    _ctdb_service_started_file="${ctdb_fail_dir}/${service_name}.started"
}
699
# Create the "started" flag file for $service_name, along with its
# directory if necessary.
ctdb_check_tcp_init ()
{
    _ctdb_check_tcp_common

    # Everything up to the last '/' is the directory
    mkdir -p "${_ctdb_service_started_file%/*}"
    touch "$_ctdb_service_started_file"
}
706
# Check whether something is listening on all of the given TCP ports
# using the "ctdb checktcpport" command.
#
# usage: ctdb_check_tcp_ports <ports...>
#
# "ctdb checktcpport" exiting with 0 means it was able to bind to the
# port, so nothing is listening there; 98 means the bind failed
# because something is already listening.
#
# Returns 0 if all ports are listening (and removes the "started" flag
# file), 1 at the first port that is not, or the unexpected exit
# status of "ctdb checktcpport".  Calling this without any port is an
# internal error and exits with status 1.
ctdb_check_tcp_ports()
{
    if [ -z "$1" ] ; then
        echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
        exit 1
    fi

    for _p ; do  # process each function argument (port)
        _cmd="ctdb checktcpport $_p"
        _out=$($_cmd 2>&1)
        _ret=$?
        case "$_ret" in
            0)
                _ctdb_check_tcp_common
                if [ ! -f "$_ctdb_service_started_file" ] ; then
                    echo "ERROR: $service_name tcp port $_p is not responding"
                    debug "\"ctdb checktcpport $_p\" was able to bind to port"
                else
                    # Service only just started: be more subtle
                    echo "INFO: $service_name tcp port $_p is not responding"
                fi

                return 1
                ;;
            98)
                # Couldn't bind, something already listening, next port...
                continue
                ;;
            *)
                echo "ERROR: unexpected error running \"ctdb checktcpport\""
                debug <<EOF
ctdb checktcpport (exited with $_ret) with output:
$_out
EOF
                return $_ret
                ;;
        esac
    done

    # All ports listening
    _ctdb_check_tcp_common
    rm -f "$_ctdb_service_started_file"
    return 0
}
751
######################################################
# check a unix socket
# usage: ctdb_check_unix_socket <socket_path>
#
# Looks for a listening unix socket whose netstat line ends in the
# given path.  Prints an error (mentioning $service_name) and returns
# 1 if there is none.  An empty path is not checked.
######################################################
ctdb_check_unix_socket() {
    socket_path="$1"
    if [ -z "$socket_path" ] ; then
        return 0
    fi

    netstat --unix -a -n | \
        grep -q "^unix.*LISTEN.*${socket_path}$" && return 0

    echo "ERROR: $service_name socket $socket_path not found"
    return 1
}
765
######################################################
# check a command returns zero status
# usage: ctdb_check_command <command>
#
# The command's output is captured.  If the command fails, an error
# is printed, the captured output is passed to debug() and the script
# exits with status 1.
######################################################
ctdb_check_command ()
{
    _out=$("$@" 2>&1) && return 0

    echo "ERROR: $* returned error"
    echo "$_out" | debug
    exit 1
}
778
################################################
# kill off any TCP connections with the given IP
#
# $1 - IP address
# $2 - "oneway" to only kill the local end of each connection
#
# Connections come from get_tcp_connections_for_ip() and are handed
# to "ctdb killtcp" on stdin.  Afterwards the connections are polled
# once a second, giving up after a few attempts.
################################################
kill_tcp_connections ()
{
    _ip="$1"

    _oneway=false
    if [ "$2" = "oneway" ] ; then
        _oneway=true
    fi

    get_tcp_connections_for_ip "$_ip" | {
        _killcount=0
        _connections=""
        _nl="
"
        while read _dst _src; do
            # we only do one-way killtcp for CIFS
            case "${_dst##*:}" in
                139|445) _this_oneway=true ;;
                *)       _this_oneway=$_oneway ;;
            esac

            echo "Killing TCP connection $_src $_dst"
            _connections="${_connections}${_nl}${_src} ${_dst}"
            # Both directions unless only the local end is wanted
            $_this_oneway || \
                _connections="${_connections}${_nl}${_dst} ${_src}"

            _killcount=$(($_killcount + 1))
        done

        [ $_killcount -gt 0 ] || return 0

        if ! echo "$_connections" | ctdb killtcp ; then
            echo "Failed to send killtcp control"
            return
        fi

        # Up to 4 looks at what is left, a second apart
        for _count in 1 2 3 4 ; do
            _remaining=$(get_tcp_connections_for_ip $_ip | wc -l)

            if [ $_remaining -eq 0 ] ; then
                echo "Killed $_killcount TCP connections to released IP $_ip"
                return
            fi

            if [ $_count -gt 3 ] ; then
                echo "Timed out killing tcp connections for IP $_ip ($_remaining remaining)"
                return
            fi

            echo "Waiting for $_remaining connections to be killed for IP $_ip"
            sleep 1
        done
    }
}
842
##################################################################
# kill off the local end for any TCP connections with the given IP
#
# $1 - IP address
##################################################################
kill_tcp_connections_local_only ()
{
    kill_tcp_connections "$1" 'oneway'
}
850
##################################################################
# tickle any TCP connections with the given IP
#
# $1 - IP address
#
# Every connection reported by get_tcp_connections_for_ip() is
# tickled in both directions with "ctdb tickle".  A single message is
# printed at the end if any of the tickles failed.
##################################################################
tickle_tcp_connections ()
{
    _ip="$1"

    get_tcp_connections_for_ip "$_ip" |
    {
        _failed=false

        while read _dst _src ; do
            for _pair in "$_src $_dst" "$_dst $_src" ; do
                echo "Tickle TCP connection $_pair"
                # Unquoted so that the two endpoints are separate words
                ctdb tickle $_pair >/dev/null 2>&1 || _failed=true
            done
        done

        if $_failed ; then
            echo "Failed to send tickle control"
        fi
    }
}
874
# List established TCP connections whose local address is the given IP.
#
# $1 - IP address
#
# "netstat -tn" output is filtered for tcp lines in state ESTABLISHED
# whose local address starts with "IP:" or "::ffff:IP:".  One
# connection is printed per line as "LOCAL_ADDR:PORT REMOTE_ADDR:PORT".
get_tcp_connections_for_ip ()
{
    _ip="$1"

    netstat -tn | awk -v ip=$_ip '
        index($1, "tcp") == 1 && $6 == "ESTABLISHED" &&
        (index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1) {
            print $4 " " $5
        }'
}
885
##################################################################
# use statd-callout to update NFS lock info
#
# Runs "$CTDB_BASE/statd-callout update" if that program is
# executable; otherwise does nothing.
##################################################################
nfs_update_lock_info ()
{
    [ -x "$CTDB_BASE/statd-callout" ] || return 0

    "$CTDB_BASE/statd-callout" update
}
895
########################################################
# start/stop the Ganesha nfs service
#
# $1 - "start", "stop" or "restart"; anything else is ignored
#
# The service is called nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE.
########################################################
startstop_ganesha()
{
    _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
    case "$1" in
        start|stop)
            service "$_service_name" "$1"
            ;;
        restart)
            service "$_service_name" stop
            # Dump rpc.statd stack traces between the stop and the start
            nfs_dump_some_threads "rpc.statd"
            service "$_service_name" start
            ;;
    esac
}
916
########################################################
# start/stop the nfs service on different platforms
#
# $1 - "start", "stop" or "restart"; anything else is ignored
#
# Sets $PLATFORM: "rhel" if $CTDB_ETCDIR/init.d/nfslock is executable
# or /usr/lib/systemd/system/nfs-lock.service is readable, else
# "sles" if $CTDB_ETCDIR/init.d/nfsserver is executable.  On any other
# platform a message is printed and the script exits with status 1.
########################################################
startstop_nfs() {
    if [ -x $CTDB_ETCDIR/init.d/nfslock -o \
         -r /usr/lib/systemd/system/nfs-lock.service ] ; then
        PLATFORM="rhel"
    elif [ -x $CTDB_ETCDIR/init.d/nfsserver ] ; then
        PLATFORM="sles"
    else
        PLATFORM="unknown"
    fi

    case "${PLATFORM}:$1" in
        sles:start)
            service nfsserver start
            ;;
        sles:stop)
            service nfsserver stop > /dev/null 2>&1
            ;;
        sles:restart)
            set_proc "fs/nfsd/threads" 0
            service nfsserver stop > /dev/null 2>&1
            pkill -9 nfsd
            nfs_dump_some_threads
            service nfsserver start
            ;;
        rhel:start)
            service nfslock start
            service nfs start
            ;;
        rhel:stop)
            service nfs stop
            service nfslock stop
            ;;
        rhel:restart)
            set_proc "fs/nfsd/threads" 0
            service nfs stop > /dev/null 2>&1
            service nfslock stop > /dev/null 2>&1
            pkill -9 nfsd
            nfs_dump_some_threads
            service nfslock start
            service nfs start
            ;;
        sles:*|rhel:*)
            # Unknown operation: nothing to do
            ;;
        *)
            echo "Unknown platform. NFS is not supported with ctdb"
            exit 1
            ;;
    esac
}
975
# Dump up to the configured number of stack traces for a program.
#
# $1 - program name, default "nfsd"
#
# The limit is $CTDB_NFS_DUMP_STUCK_THREADS, default 5; nothing is
# dumped unless it is greater than 0.
nfs_dump_some_threads ()
{
    _prog="${1:-nfsd}"

    _num="${CTDB_NFS_DUMP_STUCK_THREADS:-5}"
    if [ $_num -gt 0 ] ; then
        program_stack_traces "$_prog" $_num
    fi
}
986
########################################################
# start/stop the nfs lockmanager service on different platforms
#
# $1 - "start", "stop" or "restart"; anything else is ignored
#
# Sets $PLATFORM: "rhel" if $CTDB_ETCDIR/init.d/nfslock is executable
# or /usr/lib/systemd/system/nfs-lock.service is readable, else
# "sles" if $CTDB_ETCDIR/init.d/nfsserver is executable.  On any other
# platform a message is printed and the script exits with status 1.
########################################################
startstop_nfslock() {
    if [ -x $CTDB_ETCDIR/init.d/nfslock -o \
         -r /usr/lib/systemd/system/nfs-lock.service ] ; then
        PLATFORM="rhel"
    elif [ -x $CTDB_ETCDIR/init.d/nfsserver ] ; then
        PLATFORM="sles"
    else
        PLATFORM="unknown"
    fi

    case "${PLATFORM}:$1" in
        # for sles there is no service for lockmanager
        # so we instead just shutdown/restart nfs
        sles:start)
            service nfsserver start
            ;;
        sles:stop)
            service nfsserver stop > /dev/null 2>&1
            ;;
        sles:restart)
            service nfsserver stop > /dev/null 2>&1
            service nfsserver start
            ;;
        rhel:start)
            service nfslock start
            ;;
        rhel:stop)
            service nfslock stop > /dev/null 2>&1
            ;;
        rhel:restart)
            service nfslock stop > /dev/null 2>&1
            service nfslock start
            ;;
        sles:*|rhel:*)
            # Unknown operation: nothing to do
            ;;
        *)
            echo "Unknown platform. NFS locking is not supported with ctdb"
            exit 1
            ;;
    esac
}
1037
1038 ########################################################
1039
# Add an address to an interface.
#   $1 - interface name
#   $2 - IP address (IPv4 or IPv6, without mask)
#   $3 - number of mask bits
# The interface is brought up first; failure to do so is fatal (die).
# Returns 1 (after printing a message) if the address cannot be added
# or, for IPv6, if the address is still tentative after ~5 seconds or
# failed duplicate address detection.
add_ip_to_iface ()
{
    _iface=$1
    _ip=$2
    _maskbits=$3

    # Ensure interface is up
    ip link set "$_iface" up || \
        die "Failed to bringup interface $_iface"

    # Only need to define broadcast for IPv4.  The test must be made
    # on $_ip (the address passed in): an empty $_bcast is also what
    # selects the IPv6 duplicate-address handling below.
    case "$_ip" in
        *:*) _bcast=""      ;;
        *)   _bcast="brd +" ;;
    esac

    # $_bcast is deliberately unquoted: it is either empty or 2 words.
    ip addr add "$_ip/$_maskbits" $_bcast dev "$_iface" || {
        echo "Failed to add $_ip/$_maskbits on dev $_iface"
        return 1
    }

    # Wait 5 seconds for IPv6 addresses to stop being tentative...
    if [ -z "$_bcast" ] ; then
        for _x in $(seq 1 10) ; do
            ip addr show to "${_ip}/128" | grep -q "tentative" || break
            sleep 0.5
        done

        # If the address was a duplicate then it won't be on the
        # interface so flag an error.
        _t=$(ip addr show to "${_ip}/128")
        case "$_t" in
            "")
                echo "Failed to add $_ip/$_maskbits on dev $_iface"
                return 1
                ;;
            *tentative*|*dadfailed*)
                # Still unusable: remove it again before failing.
                echo "Failed to add $_ip/$_maskbits on dev $_iface"
                ip addr del "$_ip/$_maskbits" dev "$_iface"
                return 1
                ;;
        esac
    fi
}
1084
# Remove an address from an interface.
#   $1 - interface name
#   $2 - IP address
#   $3 - number of mask bits
# Returns 1 (after printing a message) if the address can't be deleted.
delete_ip_from_iface()
{
    _iface=$1
    _ip=$2
    _maskbits=$3

    # Enable promote_secondaries on this interface before deleting.
    # It could be switched on globally, but restricting it to the
    # interfaces that carry CTDB public addresses avoids surprises.
    # There is no better place to do it, so it is simply set on every
    # call; that is much cheaper than remembering and re-adding
    # secondaries.
    set_proc "sys/net/ipv4/conf/${_iface}/promote_secondaries" 1

    if ! ip addr del "$_ip/$_maskbits" dev "$_iface" ; then
        echo "Failed to del $_ip on dev $_iface"
        return 1
    fi
}
1103
# If the given IP is hosted then print 3 items: maskbits, iface and
# address family ("inet" or "inet6").  Prints nothing otherwise.
#   $1 - IP address (IPv4 or IPv6, without mask)
ip_maskbits_iface ()
{
    _addr="$1"

    if [ "${_addr#*:}" != "$_addr" ] ; then
        # Contains a colon, so treat as IPv6
        _family="inet6"
        _bits=128
    else
        _family="inet"
        _bits=32
    fi

    # The first output line carries the interface name (with a
    # trailing ":" and possibly an "@parent" suffix, both stripped);
    # each address line carries "address/maskbits" in field 2.
    ip addr show to "${_addr}/${_bits}" 2>/dev/null |
        awk -v family="${_family}" '
            NR == 1 {
                iface = $2
                sub(":$", "", iface)
                sub("@.*", "", iface)
            }
            $1 ~ /inet/ {
                mask = $2
                sub(".*/", "", mask)
                print mask, iface, family
            }'
}
1121
# Remove a public address from whichever interface currently hosts it.
#   $1 - IP address, optionally followed by "/maskbits" (ignored)
# Does nothing if the address is not hosted on this node.
drop_ip ()
{
    _addr="${1%/*}"  # Remove optional maskbits

    # Replace the positional parameters with: maskbits iface family
    set -- $(ip_maskbits_iface $_addr)

    # Not hosted here?  Then there is nothing to remove.
    [ -n "$1" ] || return 0

    _maskbits="$1"
    _iface="$2"
    echo "Removing public address $_addr/$_maskbits from device $_iface"
    delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
}
1134
# Drop every address listed in the $CTDB_PUBLIC_ADDRESSES file from
# this node.  Only the first field of each line (the address) is used.
# If the variable is unset or empty then nothing is done.
drop_all_public_ips ()
{
    _pa_file="${CTDB_PUBLIC_ADDRESSES:-/dev/null}"

    while read _ip _x ; do
        drop_ip "$_ip"
    done <"$_pa_file"
}
1141
# Ask the kernel to flush its IPv4 and IPv6 routing caches by writing
# 1 to the respective route/flush entries via set_proc_maybe.
flush_route_cache ()
{
    for _ipver in ipv4 ipv6 ; do
        set_proc_maybe "sys/net/${_ipver}/route/flush" 1
    done
}
1147
1148 ########################################################
1149 # Simple counters
# Common setup for the counter functions.
#   $1 - optional counter name; falls back to $service_name and then
#        to $script_name
# Sets $_service_name and $_counter_file, and makes sure the directory
# that will contain the counter file exists.
_ctdb_counter_common () {
    _service_name="$1"
    [ -n "$_service_name" ] || _service_name="$service_name"
    [ -n "$_service_name" ] || _service_name="$script_name"

    _counter_file="${ctdb_fail_dir}/${_service_name}"
    mkdir -p "${_counter_file%/*}" # dirname
}
# Reset a counter to zero.
#   $1 - optional counter name (see _ctdb_counter_common)
ctdb_counter_init () {
    _ctdb_counter_common "$1"

    # The count is the file size, so an empty file means zero.
    : >"$_counter_file"
}
# Increment a counter by one.
#   $1 - optional counter name (see _ctdb_counter_common)
ctdb_counter_incr () {
    _ctdb_counter_common "$1"

    # Unary counting: the count is the size of the file, so append
    # exactly one byte.  printf is used rather than "echo -n" because
    # the handling of -n by echo is not portable between shells and a
    # shell that does not honour it would append "-n 1" plus a
    # newline, corrupting the count.
    printf '1' >> "$_counter_file"
}
# Print the current value of a counter.
#   $1 - optional counter name (see _ctdb_counter_common)
# Prints 0 if the size of the counter file can't be determined (for
# example, because the file does not exist).
ctdb_counter_get () {
    _ctdb_counter_common "$1"

    # Unary counting: the value is the size of the file in bytes.
    if ! stat -c "%s" "$_counter_file" 2>/dev/null ; then
        echo 0
    fi
}
# Compare a counter against a limit.
#   $1 - "error" (default) to print a message and exit 1 when the
#        limit is hit; anything else makes a hit just return 1
#   $2 - integer operator supported by test (default "-ge"), or "%"
#        meaning "counter is a multiple of the limit"
#   $3 - the limit (default $service_fail_limit)
#   $4 - optional counter name (see _ctdb_counter_common)
# Returns 0 if the limit has not been hit.
ctdb_check_counter () {
    _msg="${1:-error}"  # "error"  - anything else is silent on fail
    _op="${2:--ge}"  # an integer operator supported by test
    _limit="${3:-${service_fail_limit}}"

    # All of the above are optional, so only discard as many arguments
    # as were actually passed.  An unconditional "shift 3" fails when
    # fewer are given (fatally in some shells) and would otherwise
    # leave $1 still pointing at the message word.
    if [ $# -ge 3 ] ; then
        shift 3
    else
        shift $#
    fi

    # ctdb_counter_get runs in a subshell below, so nothing it sets is
    # visible here.  Work out the name used in the error message
    # locally, using the same fallbacks as the counter functions.
    _service_name="${1:-${service_name:-${script_name}}}"

    _size=$(ctdb_counter_get "$1")

    _hit=false
    if [ "$_op" != "%" ] ; then
        if [ $_size $_op $_limit ] ; then
            _hit=true
        fi
    else
        # Hit whenever the counter is an exact multiple of the limit
        if [ $(($_size $_op $_limit)) -eq 0 ] ; then
            _hit=true
        fi
    fi
    if $_hit ; then
        if [ "$_msg" = "error" ] ; then
            echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
            exit 1
        else
            return 1
        fi
    fi
}
1198
1199 ########################################################
1200
# Per-script status files and failure counters live under these
# directories.
ctdb_status_dir="$CTDB_VARDIR/state/service_status"
ctdb_fail_dir="$CTDB_VARDIR/state/failcount"

# Create the state directory for a service and record its path in
# $service_state_dir.
#   $1 - optional service name (defaults to $service_name)
# Exits the calling script with status 1 if the directory can't be
# created.
ctdb_setup_service_state_dir ()
{
    service_state_dir="$CTDB_VARDIR/state/service_state/${1:-${service_name}}"

    if ! mkdir -p "$service_state_dir" ; then
        echo "Error creating state dir \"$service_state_dir\""
        exit 1
    fi
}
1212
1213 ########################################################
1214 # Managed status history, for auto-start/stop
1215
# Directory holding one marker file per service that has been managed.
ctdb_managed_dir="$CTDB_VARDIR/state/managed_history"

# Set $_ctdb_managed_file to the marker file for $service_name.
_ctdb_managed_common ()
{
    _ctdb_managed_file="${ctdb_managed_dir}/${service_name}"
}
1222
# Record that $service_name is managed by creating (or touching) its
# marker file, creating the marker directory first if necessary.
ctdb_service_managed ()
{
    _ctdb_managed_common

    mkdir -p "${ctdb_managed_dir}"
    touch "${_ctdb_managed_file}"
}
1229
# Forget that $service_name was managed by removing its marker file.
# It is not an error if the marker does not exist.
ctdb_service_unmanaged ()
{
    _ctdb_managed_common

    rm -f "${_ctdb_managed_file}"
}
1235
# Succeed if the marker file for $service_name exists, that is, if the
# service was recorded as managed by ctdb_service_managed.
is_ctdb_previously_managed_service ()
{
    _ctdb_managed_common

    test -f "$_ctdb_managed_file"
}
1241
1242 ########################################################
1243 # Check and set status
1244
# Print a one-line report of a recorded status problem.
#   $1 - node state being reported (e.g. "unhealthy", "banned")
#   $2 - file containing the description of the problem
log_status_cat ()
{
    # The file name is quoted so that paths containing whitespace or
    # glob characters are read as a single file.
    echo "node is \"$1\", \"${script_name}\" reports problem: $(cat "$2")"
}
1249
# Report any status recorded for this script by ctdb_setstatus.
# Returns:
#   1 - an "unhealthy" status is recorded (checked first)
#   2 - a "banned" status is recorded
#   0 - neither is recorded
# For 1 and 2 the recorded problem is printed via log_status_cat.
ctdb_checkstatus ()
{
    if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
        log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
        return 1
    fi

    if [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
        log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
        return 2
    fi

    return 0
}
1262
# Record or clear a status for this script.
#   $1 - "unhealthy" or "banned" to record that status; anything else
#        clears both
#   $2 - when recording: file whose contents describe the problem
ctdb_setstatus ()
{
    d="$ctdb_status_dir/$script_name"

    if [ "$1" = "unhealthy" ] || [ "$1" = "banned" ] ; then
        # The status file is named after the status itself.
        mkdir -p "$d"
        cat "$2" >"$d/$1"
    else
        for i in "banned" "unhealthy" ; do
            rm -f "$d/$i"
        done
    fi
}
1278
1279 ##################################################################
1280 # Reconfigure a service on demand
1281
# Common setup for the reconfigure functions: sets $_d to the status
# directory for $service_name (creating it if needed) and sets
# $_ctdb_service_reconfigure_flag to the flag file inside it.
_ctdb_service_reconfigure_common ()
{
    _d="${ctdb_status_dir}/${service_name}"
    mkdir -p "${_d}"
    _ctdb_service_reconfigure_flag="${_d}/reconfigure"
}
1288
# Succeed if the reconfigure flag file for $service_name exists.
ctdb_service_needs_reconfigure ()
{
    _ctdb_service_reconfigure_common

    test -e "$_ctdb_service_reconfigure_flag"
}
1294
# Schedule a reconfigure of $service_name by creating (or truncating)
# its reconfigure flag file.
ctdb_service_set_reconfigure ()
{
    _ctdb_service_reconfigure_common

    : >"$_ctdb_service_reconfigure_flag"
}
1300
# Cancel any scheduled reconfigure of $service_name by removing its
# reconfigure flag file.  A missing flag is not an error.
ctdb_service_unset_reconfigure ()
{
    _ctdb_service_reconfigure_common

    rm -f "${_ctdb_service_reconfigure_flag}"
}
1306
# Reconfigure $service_name now: clear the reconfigure flag, run the
# service_reconfigure hook and, if that succeeds, reset the failure
# counter.  If the hook fails then its status is returned and the
# counter is left alone.
ctdb_service_reconfigure ()
{
    echo "Reconfiguring service \"${service_name}\"..."
    ctdb_service_unset_reconfigure

    if service_reconfigure ; then
        ctdb_counter_init
    else
        # $? still holds the status of service_reconfigure here
        return $?
    fi
}
1314
# Default service_reconfigure() hook: does nothing and succeeds.  It
# is called by ctdb_service_reconfigure.
service_reconfigure ()
{
    return 0
}
1320
# Try to take the reconfigure lock for $service_name.
# The lock is a file containing the name of the locking event on the
# first line, followed by the PID(s) of the holder(s).  The lock is
# considered held if any listed PID, other than our own, is still
# alive.  Returns 0 if the lock was taken (the file then names
# $event_name and our PID), 1 if somebody else holds it.
ctdb_reconfigure_take_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"
    mkdir -p "${_lock%/*}" # dirname
    touch "$_lock"

    (
        # Serialise access to the lock file itself (open on stdin)
        flock 0

        # Reading a list of PIDs is overkill right now, but it keeps
        # working if certain events (e.g. takeip) are ever allowed to
        # run several times in parallel and each record a PID here.
        read _locker_event
        if [ -n "$_locker_event" ] ; then
            while read _pid ; do
                # Skip blank lines and our own PID
                [ -n "$_pid" ] || continue
                [ "$_pid" != $$ ] || continue

                if kill -0 "$_pid" 2>/dev/null ; then
                    # A live holder exists: lock not available
                    exit 1
                fi
            done
        fi

        printf "%s\n%s\n" "$event_name" $$ >"$_lock"
        exit 0
    ) <"$_lock"
}
1347
# Release the reconfigure lock for $service_name by removing the lock
# file.  A missing lock file is not an error.
ctdb_reconfigure_release_lock ()
{
    _ctdb_service_reconfigure_common

    _lock="${_d}/reconfigure_lock"
    rm -f "${_lock}"
}
1355
# Re-emit the previous "monitor" result of this script, as reported by
# "ctdb scriptstatus", and exit with its return code.  This function
# never returns to the caller.
ctdb_replay_monitor_status ()
{
    echo "Replaying previous status for this script due to reconfigure..."

    # Some versions omit the leading '|' separator, so it is optional
    # in the pattern.  A matching line looks like this:
    # |monitor|60.nfs|1|ERROR|1314764004.030861|1314764004.035514|foo bar|
    _out=$(ctdb scriptstatus -X | grep -E "^\|?monitor\|${script_name}\|")

    # Cheap way of picking out the fields in the middle: split on '|'
    # and let the (possibly empty) leading field disappear.
    set -- $(IFS="|" ; echo $_out)
    _code="$3"
    _status="$4"

    # The error output is the last field and may itself contain
    # arbitrary text, so strip the trailing separator and everything
    # up to and including the 6th field instead of splitting.  The
    # loose match at the start copes with both the broken (no leading
    # '|') and the fixed output format.
    _out="${_out%|}"
    _err_out="${_out#*monitor|${script_name}|*|*|*|*|}"

    # OK and ERROR are replayed as they are.  The other states can't
    # be expressed as an exit code, so they are recast.
    case "$_status" in
        TIMEDOUT)
            # Replayed as a plain error
            _code=1
            _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        DISABLED)
            # Replayed as success
            _code=0
            _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
    esac

    [ -z "$_err_out" ] || echo "$_err_out"

    exit $_code
}
1393
# Coordinate reconfiguration of $service_name with the "monitor",
# "ipreallocated" and "reconfigure" events.  All other events return
# immediately.  Depending on the event and on whether the reconfigure
# lock can be taken, this may exit the calling script.
ctdb_service_check_reconfigure ()
{
    assert_service_name

    # Only a few events are of interest here.
    case "$event_name" in
        monitor|ipreallocated|reconfigure) : ;;
        *) return 0 ;;
    esac

    if ! ctdb_reconfigure_take_lock ; then
        # Another event that we must not collide with is running, so
        # tread carefully.
        case "$event_name" in
            reconfigure)
                # Ask the caller to retry.
                exit 2
                ;;
            ipreallocated)
                # Leave any scheduled reconfigure for later and let
                # the eventscript run the rest of the ipreallocated
                # event.  This assumes that the event does not depend
                # on a scheduled reconfigure, which holds for the
                # current code.
                return 0
                ;;
            monitor)
                # A reconfigure is most likely in progress, so the
                # service may be unstable.  Again, any scheduled
                # reconfigure is deferred.  The best information
                # available is the previous monitor status, so
                # replay that.
                ctdb_replay_monitor_status
                ;;
        esac
        return
    fi

    # We hold the lock: none of the events covered by this function
    # are running, so it is safe to go ahead.
    case "$event_name" in
        reconfigure)
            (ctdb_service_reconfigure)
            exit $?
            ;;
        ipreallocated)
            if ctdb_service_needs_reconfigure ; then
                ctdb_service_reconfigure
            fi
            ;;
    esac

    ctdb_reconfigure_release_lock
}
1448
1449 ##################################################################
1450 # Does CTDB manage this service? - and associated auto-start/stop
1451
# Backward compatibility helper for the old CTDB_MANAGES_* variables.
#   $1 - value of the old variable
#   $2 - service name that the old variable refers to
# If $1 is "yes" and $2 is the current $service_name then $2 is
# appended to $CTDB_MANAGED_SERVICES.
ctdb_compat_managed_service ()
{
    [ "$1" = "yes" ] || return 0
    [ "$2" = "$service_name" ] || return 0

    CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
}
1458
# Succeed if $service_name is listed in $CTDB_MANAGED_SERVICES, either
# directly or after folding in the old CTDB_MANAGES_* variables.
is_ctdb_managed_service ()
{
    assert_service_name

    # Surround the list with spaces so that a service name can be
    # matched as a whole word, including at either end of the list.
    t=" $CTDB_MANAGED_SERVICES "

    case "$t" in
        *" "${service_name}" "*) return 0 ;;
    esac

    # No match, so add any services enabled via the old variables (for
    # backward compatibility) and look again.
    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"

    t=" $CTDB_MANAGED_SERVICES "

    case "$t" in
        *" "${service_name}" "*) return 0 ;;
        *) return 1 ;;
    esac
}
1489
# Start or stop $service_name when appropriate for the current event.
#  * "service-start"/"service-stop" pseudo-events start/stop the
#    service by hand and exit with the result.  They are refused (die)
#    if the service is managed or if auto-start/stop is enabled.
#  * With $CTDB_SERVICE_AUTOSTARTSTOP = yes, a "monitor" event starts a
#    service that has become managed, or stops one that is no longer
#    managed, in the background and then exits.
# In all other cases this returns 0 without doing anything.
ctdb_start_stop_service ()
{
    assert_service_name

    case "$event_name" in
        service-start|service-stop)
            # Manual start/stop is only allowed when we are neither
            # managing nor auto-starting/stopping the service.
            if is_ctdb_managed_service ; then
                die "$event_name event not permitted when service is managed"
            fi
            if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
                die "$event_name event not permitted with \$CTDB_SERVICE_AUTOSTARTSTOP = yes"
            fi
            case "$event_name" in
                service-start) ctdb_service_start ;;
                service-stop)  ctdb_service_stop  ;;
            esac
            exit $?
            ;;
    esac

    # Auto-start/stop only happens if configured, and only when
    # monitoring.
    if [ "$CTDB_SERVICE_AUTOSTARTSTOP" != "yes" ] ; then
        return 0
    fi
    if [ "$event_name" != "monitor" ] ; then
        return 0
    fi

    if is_ctdb_managed_service ; then
        if ! is_ctdb_previously_managed_service ; then
            echo "Starting service \"$service_name\" - now managed"
            background_with_logging ctdb_service_start
            exit $?
        fi
    elif is_ctdb_previously_managed_service ; then
        echo "Stopping service \"$service_name\" - no longer managed"
        background_with_logging ctdb_service_stop
        exit $?
    fi
}
1539
# Start $service_name via the service_start hook.  On success the
# failure counter is reset and ctdb_check_tcp_init is run; on failure
# the status of the hook is returned.
ctdb_service_start ()
{
    # Mark the service as managed as soon as a start is attempted,
    # whether or not the start succeeds.
    ctdb_service_managed

    if service_start ; then
        ctdb_counter_init
        ctdb_check_tcp_init
    else
        # $? still holds the status of service_start here
        return $?
    fi
}
1550
# Stop $service_name: first forget that it was managed, then run the
# service_stop hook, whose status is returned.
ctdb_service_stop ()
{
    ctdb_service_unmanaged

    service_stop
}
1556
# Default service_start() and service_stop() hooks, which simply use
# the "service" command on $service_name.
# An eventscript may override either of them.
service_start ()
{
    service "${service_name}" start
}
1564
# Default hook to stop $service_name; may be overridden.
service_stop ()
{
    service "${service_name}" stop
}
1569
1570 ##################################################################
1571
# Handle the events that are common to all eventscripts.
#   $1     - event name
#   $2...  - for "setstatus": arguments passed on to ctdb_setstatus
# "status" and "setstatus" are handled here and exit the script with
# the status of the respective helper.  Any other event returns 0 so
# that the eventscript can handle it.
ctdb_standard_event_handler ()
{
    if [ "$1" = "status" ] ; then
        ctdb_checkstatus
        exit
    elif [ "$1" = "setstatus" ] ; then
        shift
        ctdb_setstatus "$@"
        exit
    fi
}
1586
# Run iptables or ip6tables under a lock.
#   $1     - address family: "inet6" selects ip6tables, anything else
#            selects iptables
#   $2...  - arguments for the selected command
iptables_wrapper ()
{
    _family="$1" ; shift

    case "$_family" in
        inet6) _iptables_cmd="ip6tables" ;;
        *)     _iptables_cmd="iptables"  ;;
    esac

    # iptables must not be re-entered, so serialise all invocations
    # with flock, waiting up to 30 seconds for the lock.
    flock -w 30 "${CTDB_VARDIR}/iptables-ctdb.flock" "$_iptables_cmd" "$@"
}
1599
# AIX (and perhaps others?) doesn't have mktemp
if ! type mktemp >/dev/null 2>&1 ; then
    # Minimal replacement for mktemp(1).
    #   -d  - optional first argument: create a directory, not a file
    # Creates tmp.<10 hex digits> under ${TMPDIR:-/tmp}, accessible to
    # the owner only, and prints its path.  Returns 1 without printing
    # anything if it can't be created.
    mktemp ()
    {
        _dir=false
        if [ "$1" = "-d" ] ; then
            _dir=true
            shift
        fi
        _d="${TMPDIR:-/tmp}"
        # First 10 hex digits of a hash of some random data
        _hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
            md5sum | \
            sed -e 's@\(..........\).*@\1@')
        _t="${_d}/tmp.${_hex10}"
        (
            umask 077
            if $_dir ; then
                mkdir "$_t"
            else
                # noclobber: refuse to reuse something that already
                # exists under this name instead of truncating it
                set -C
                : >"$_t"
            fi
        ) || return 1
        echo "$_t"
    }
fi
1625
1626 ########################################################
1627 # tickle handling
1628 ########################################################
1629
# Synchronise the tickle list held by ctdb with the TCP connections
# that are currently established to our public addresses on a port.
#   $1 - TCP port number
# Tickles are added for connections that have none and removed for
# connections that no longer exist.
update_tickles ()
{
        _port="$1"

        tickledir="$CTDB_VARDIR/state/tickles"
        mkdir -p "$tickledir"

        ctdb_get_pnn

        # Public IPs currently assigned to this node
        _ips=$(ctdb -X ip | awk -F'|' -v pnn=$pnn '$3 == pnn {print $2}')

        # The same IPs as a regexp alternation with the dots escaped.
        # The backslashes are doubled because the pattern is passed to
        # awk via -v below.
        _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"

        # Temporary file 1: established connections to our public IPs
        # on the port, as "source destination", sorted for comm
        _my_connections="${tickledir}/${_port}.connections"
        rm -f "$_my_connections"
        netstat -tn | \
            awk -v destpat="^${_ipschoice}:${_port}\$" \
                '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' | \
            sort >"$_my_connections"

        # Temporary file 2: tickles that ctdb currently knows about, in
        # the same format.  The first line of each gettickles output is
        # skipped.
        _my_tickles="${tickledir}/${_port}.tickles"
        rm -f "$_my_tickles"
        for _i in $_ips ; do
                ctdb -X gettickles $_i $_port | \
                    awk -F'|' 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
        done | sort >"$_my_tickles"

        # Connections without a tickle: add one
        comm -23 "$_my_connections" "$_my_tickles" | \
            while read _src _dst ; do
                    ctdb addtickle $_src $_dst
            done

        # Tickles without a connection: remove them
        comm -13 "$_my_connections" "$_my_tickles" | \
            while read _src _dst ; do
                    ctdb deltickle $_src $_dst
            done

        rm -f "$_my_connections" "$_my_tickles"
}
1676
1677 ########################################################
1678 # load a site local config file
1679 ########################################################
1680
# Source the site-local script named by $CTDB_RC_LOCAL, if it is set
# and executable.
if [ -n "$CTDB_RC_LOCAL" ] && [ -x "$CTDB_RC_LOCAL" ] ; then
        . "$CTDB_RC_LOCAL"
fi

# Paths under $CTDB_BASE are quoted so that a base directory containing
# whitespace or glob characters is handled correctly.
if [ -x "$CTDB_BASE/rc.local" ] ; then
        . "$CTDB_BASE/rc.local"
fi

# Source every executable regular file in rc.local.d.  Directories
# also pass the -x test, so they are excluded explicitly.
if [ -d "$CTDB_BASE/rc.local.d" ] ; then
        for i in "$CTDB_BASE"/rc.local.d/* ; do
                [ -f "$i" ] && [ -x "$i" ] && . "$i"
        done
fi

# Globals set after the site-local scripts above have been sourced.
script_name="${0##*/}"       # basename
service_fail_limit=1
event_name="$1"