ctdb-scripts: Remove functions startstop_nfs() and startstop_nfslock()
[samba.git] / ctdb / config / functions
1 # Hey Emacs, this is a -*- shell-script -*- !!!
2
3 # utility functions for ctdb event scripts
4
# Default the CTDB state directory if the caller has not set one:
# prefer /var/lib/ctdb when it exists, otherwise use /var/ctdb.
if [ -z "$CTDB_VARDIR" ] ; then
    CTDB_VARDIR="/var/ctdb"
    [ ! -d "/var/lib/ctdb" ] || CTDB_VARDIR="/var/lib/ctdb"
    export CTDB_VARDIR
fi
# Default the directory that holds sysconfig/default/init.d files.
if [ -z "$CTDB_ETCDIR" ] ; then
    export CTDB_ETCDIR="/etc"
fi
15
16 #######################################
17 # pull in a system config file, if any
_loadconfig() {
    # $1 - name of the configuration to load (optional)
    #
    # With no name, fall back to the calling script's $service_config
    # (else $service_name) and load that instead.
    if [ -z "$1" ] ; then
        foo="${service_config:-${service_name}}"
        if [ -n "$foo" ] ; then
            loadconfig "$foo"
            return
        fi
    fi

    # The main "ctdb" configuration is always pulled in first.
    if [ "$1" != "ctdb" ] ; then
        loadconfig "ctdb"
    fi

    if [ -z "$1" ] ; then
        return
    fi

    # First existing location wins.  The paths are quoted so that
    # directories or names containing whitespace or glob characters
    # are tested and sourced as a single word.
    if [ -f "$CTDB_ETCDIR/sysconfig/$1" ]; then
        . "$CTDB_ETCDIR/sysconfig/$1"
    elif [ -f "$CTDB_ETCDIR/default/$1" ]; then
        . "$CTDB_ETCDIR/default/$1"
    elif [ -f "$CTDB_BASE/sysconfig/$1" ]; then
        . "$CTDB_BASE/sysconfig/$1"
    fi

    # For "ctdb" itself, ctdbd.conf is sourced last, after any file
    # found above.
    if [ "$1" = "ctdb" ] ; then
        _config="${CTDB_BASE}/ctdbd.conf"
        if [ -r "$_config" ] ; then
            . "$_config"
        fi
    fi
}
51
# Thin wrapper: forwards every argument unchanged to _loadconfig().
loadconfig ()
{
    _loadconfig "$@"
}
55
56 ##############################################################
57
58 # CTDB_SCRIPT_DEBUGLEVEL can be overwritten by setting it in a
59 # configuration file.
debug ()
{
    # Debug output is only produced at CTDB_SCRIPT_DEBUGLEVEL 4 or
    # above (the level defaults to 2 when unset).
    if ! [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] ; then
        # Disabled.  With no arguments the caller is feeding stdin
        # (e.g. a here document), so drain and discard it.
        if [ -z "$1" ] ; then
            cat >/dev/null
        fi
        return
    fi

    case "$1" in
        "")
            # No arguments: prefix every line read from stdin
            sed -e 's@^@DEBUG: @'
            ;;
        *)
            echo "DEBUG: $*"
            ;;
    esac
}
77
# Print a message on stdout and terminate the script.
# $1 - message to print
# $2 - exit code, defaults to 1
die ()
{
    _msg="$1"
    _rc=1
    if [ -n "$2" ] ; then
        _rc="$2"
    fi

    echo "$_msg"
    exit $_rc
}
86
87 # Log given message or stdin to either syslog or a CTDB log file
88 # $1 is the tag passed to logger if syslog is in use.
script_log ()
{
    # $1 is the logger tag; any remaining arguments form the message.
    # With no message the text to log is read from stdin.
    _tag="$1" ; shift

    case "$CTDB_LOGGING" in
        file:*|"")
            # Log to a file: the one named after "file:", or the
            # default when CTDB_LOGGING is unset/empty.
            if [ -n "$CTDB_LOGGING" ] ; then
                _file="${CTDB_LOGGING#file:}"
            else
                _file="/var/log/log.ctdb"
            fi
            {
                if [ -n "$*" ] ; then
                    echo "$*"
                else
                    cat
                fi
            } >>"$_file"
            ;;
        *)
            # Handle all syslog:* variants here too.  There's no tool to do
            # the lossy things, so just use logger.
            #
            # The message is passed as one quoted word so that it is not
            # subject to field splitting or pathname expansion.  With no
            # message, logger is run without one so it reads stdin.
            if [ -n "$*" ] ; then
                logger -t "ctdbd: ${_tag}" "$*"
            else
                logger -t "ctdbd: ${_tag}"
            fi
            ;;
    esac
}
115
116 # When things are run in the background in an eventscript then logging
117 # output might get lost.  This is the "solution".  :-)
background_with_logging ()
{
    # Run the given command in a background subshell with stdin
    # detached; stdout and stderr are piped to script_log, tagged
    # with the script name followed by "&".
    ( "$@" </dev/null 2>&1 | script_log "${script_name}&" ) &

    return 0
}
127
128 ##############################################################
129 # check number of args for different events
ctdb_check_args ()
{
    # $1 is the event name; the count checked below includes it.
    # Events not listed here are not checked.
    case "$1" in
        takeip|releaseip)
            [ $# != 4 ] || return 0
            echo "ERROR: must supply interface, IP and maskbits"
            exit 1
            ;;
        updateip)
            [ $# != 5 ] || return 0
            echo "ERROR: must supply old interface, new interface, IP and maskbits"
            exit 1
            ;;
    esac
}
147
148 ##############################################################
149 # determine on what type of system (init style) we are running
detect_init_style()
{
    # Only do detection if not already set.  Return 0 explicitly: a
    # bare "return" here would hand back the non-zero status of the
    # failed -z test, making an already-configured style look like an
    # error to the caller.
    [ -z "$CTDB_INIT_STYLE" ] || return 0

    # Guess from the presence of distribution-specific helper programs
    if [ -x /sbin/startproc ]; then
        CTDB_INIT_STYLE="suse"
    elif [ -x /sbin/start-stop-daemon ]; then
        CTDB_INIT_STYLE="debian"
    else
        CTDB_INIT_STYLE="redhat"
    fi
}
163
164 ######################################################
165 # simulate /sbin/service on platforms that don't have it
166 # _service() makes it easier to hook the service() function for
167 # testing.
_service ()
{
  # $1 - service name, $2 - operation (e.g. start, stop)
  # $_nice is set by the service()/nice_service() wrappers; it is left
  # unquoted on purpose so that an empty value disappears.
  _service_name="$1"
  _op="$2"

  # do nothing, when no service was specified
  [ -z "$_service_name" ] && return

  # Prefer a "service" command; otherwise run the init script
  # directly.  Init script paths are quoted so that unusual
  # $CTDB_ETCDIR or service names are treated as a single word.
  if [ -x /sbin/service ]; then
      $_nice /sbin/service "$_service_name" "$_op"
  elif [ -x /usr/sbin/service ]; then
      $_nice /usr/sbin/service "$_service_name" "$_op"
  elif [ -x "$CTDB_ETCDIR/init.d/$_service_name" ]; then
      $_nice "$CTDB_ETCDIR/init.d/$_service_name" "$_op"
  elif [ -x "$CTDB_ETCDIR/rc.d/init.d/$_service_name" ]; then
      $_nice "$CTDB_ETCDIR/rc.d/init.d/$_service_name" "$_op"
  fi
}
186
service()
{
    # Run the service operation without a "nice" prefix
    _nice=
    _service "$@"
}
192
193 ######################################################
194 # simulate /sbin/service (niced) on platforms that don't have it
nice_service()
{
    # Same as service(), but the command is prefixed with "nice"
    _nice=nice
    _service "$@"
}
200
201 ######################################################
202 # Cached retrieval of PNN from local node.  This never changes so why
203 # open a client connection to the server each time this is needed?
# This sets $pnn - this avoids an unnecessary subprocess.
ctdb_get_pnn ()
{
    _pnn_file="$CTDB_VARDIR/state/my-pnn"

    # Populate the cache file on first use: everything up to the last
    # colon of the "ctdb pnn" output is stripped.
    [ -f "$_pnn_file" ] || ctdb pnn | sed -e 's@.*:@@' >"$_pnn_file"

    # Result is delivered in the variable $pnn
    read pnn <"$_pnn_file"
}
214
215 ######################################################
216 # wrapper around /proc/ settings to allow them to be hooked
217 # for testing
218 # 1st arg is relative path under /proc/, 2nd arg is value to set
set_proc ()
{
    # $1 - path relative to /proc/, $2 - value written to it
    echo "$2" > "/proc/${1}"
}
223
# Like set_proc() but silently does nothing when /proc/$1 is not
# writable.
set_proc_maybe ()
{
    [ ! -w "/proc/$1" ] || set_proc "$1" "$2"
}
230
231 ######################################################
232 # wrapper around getting file contents from /proc/ to allow
233 # this to be hooked for testing
234 # 1st arg is relative path under /proc/
get_proc ()
{
    # $1 - path relative to /proc/; its contents go to stdout
    cat "/proc/${1}"
}
239
240 ######################################################
# Print up to $_max kernel stack traces for processes named $_prog
program_stack_traces ()
{
    # $1 - program name, as given to pidof
    # $2 - maximum number of traces to print, defaults to 1
    _prog="$1"
    _max="${2:-1}"

    _count=1
    for _pid in $(pidof "$_prog") ; do
        if ! [ $_count -le $_max ] ; then
            break
        fi

        # Fetch the stack before printing anything, to avoid racing
        # with process exit; processes with no readable stack are
        # skipped and do not count towards the limit.
        _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
        [ -n "$_stack" ] || continue

        echo "Stack trace for ${_prog}[${_pid}]:"
        echo "$_stack"
        _count=$((_count + 1))
    done
}
260
261 ######################################################
262 # Check the health of NFS services
263 #
264 # Use .check files in given directory.
265 # Default is "${CTDB_BASE}/nfs-checks.d/"
266 ######################################################
nfs_check_services ()
{
    # $1 - directory of .check files; defaults to
    # $CTDB_NFS_CHECKS_DIR, else ${CTDB_BASE}/nfs-checks.d
    _dir="${1:-${CTDB_NFS_CHECKS_DIR:-${CTDB_BASE}/nfs-checks.d}}"

    # Files must end with .check - avoids editor backups, RPM fu, ...
    for _f in "$_dir"/[0-9][0-9].*.check ; do
        # If nothing matches, the glob is left as the literal pattern.
        # Skip that (and any unreadable file) rather than letting the
        # input redirection below fail.
        [ -r "$_f" ] || continue

        # "NN.progname.check" -> "progname"
        _t="${_f%.check}"
        _progname="${_t##*/[0-9][0-9].}"

        nfs_check_service "$_progname" <"$_f"
    done
}
279
280 ######################################################
281 # Check the health of an NFS service
282 #
283 # $1 - progname, passed to rpcinfo (looked up in /etc/rpc)
284 #
285 # Reads variables from stdin
286 #
287 # Variables are:
288 #
289 # * family             - "tcp" or "udp" or space separated list
290 #                        default: tcp
291 # * version            - optional, RPC service version number
292 #                        default is to omit to check for any version
293 # * unhealthy_after    - number of check fails before unhealthy
294 #                        default: 1
295 # * restart_every      - number of check fails before restart
296 #                        default: 0, meaning no restart
297 # * service_stop_cmd   - command to stop service
298 #                        default: no default, must be provided if
299 #                                 restart_every > 0
300 # * service_start_cmd  - command to start service
301 #                        default: no default, must be provided if
302 #                                 restart_every > 0
303 # * service_debug_cmd  - command to debug a service after trying to stop it;
304 #                        for example, it can be useful to print stack
305 #                        traces of threads that have not exited, since
306 #                        they may be stuck doing I/O;
307 #                        no default, see also function program_stack_traces()
308 #
309 # Quoting in values is not preserved
310 #
311 ######################################################
nfs_check_service ()
{
    _progname="$1"

    (
        # Everything runs in a subshell so that the settings read from
        # stdin stay local to this check.

        # Defaults for the recognised settings
        family="tcp"
        version=""
        unhealthy_after=1
        restart_every=0
        service_stop_cmd=""
        service_start_cmd=""
        service_debug_cmd=""

        # Each accepted line is passed to eval, so variable references
        # in values are expanded.  Only the variable names listed
        # below are accepted; anything else is an error.
        while read _line ; do
            case "$_line" in
                \#*|"")
                    # Comment or blank line
                    continue
                    ;;
                family=*|version=*|unhealthy_after=*|restart_every=*|\
                service_stop_cmd=*|service_start_cmd=*|service_debug_cmd=*)
                    eval "$_line"
                    ;;
                *)
                    echo "ERROR: Unknown variable for ${_progname}: ${_line}"
                    exit 1
                    ;;
            esac
        done

        # Name used for the failure counter
        _service_name="nfs_${_progname}"

        if nfs_check_rpcinfo "$_progname" "$version" "$family" >/dev/null
        then
            # Check passed.  The counter is only reset when a
            # non-default threshold or a restart interval is set.
            if [ $unhealthy_after -ne 1 -o $restart_every -ne 0 ] ; then
                ctdb_counter_init "$_service_name"
            fi
            exit 0
        fi

        # Check failed: record it and fetch the running total
        ctdb_counter_incr "$_service_name"
        _failcount=$(ctdb_counter_get "$_service_name")

        _unhealthy=false
        if [ $unhealthy_after -gt 0 ] ; then
            if [ $_failcount -ge $unhealthy_after ] ; then
                _unhealthy=true
                echo "ERROR: $ctdb_check_rpc_out"
            fi
        fi

        # Restart on every restart_every'th failure.  If the failure
        # was not already reported as an error, report it as a warning.
        if [ $restart_every -gt 0 ] ; then
            if [ $(($_failcount % $restart_every)) -eq 0 ] ; then
                if ! $_unhealthy ; then
                    echo "WARNING: $ctdb_check_rpc_out"
                fi
                nfs_restart_service
            fi
        fi

        if $_unhealthy ; then
            exit 1
        fi

        exit 0
    ) || exit 1
}
384
# Uses: _progname, service_stop_cmd, service_start_cmd, service_debug_cmd
nfs_restart_service ()
{
    if [ -z "$service_stop_cmd" ] || [ -z "$service_start_cmd" ] ; then
        die "ERROR: Can not restart service \"${_progname}\" without corresponding service_start_cmd/service_stop_cmd settings"
    fi

    echo "Trying to restart service \"${_progname}\"..."

    # The commands go through eval, so a setting may hold several
    # semicolon separated commands.
    eval "$service_stop_cmd"

    # Optional debugging step between stop and start
    [ -z "$service_debug_cmd" ] || eval "$service_debug_cmd"

    # The start command is run in the background with its output logged
    background_with_logging eval "$service_start_cmd"
}
400
401 ######################################################
402 # Check an RPC service with rpcinfo
403 ######################################################
ctdb_check_rpc ()
{
    # $1 - program name, passed to rpcinfo (looked up in /etc/rpc)
    # $2 - version; optional, not passed to rpcinfo if empty/unset
    # $3 - transport family; optional, default is "tcp"
    _progname="$1"
    _version="$2"
    _family="${3:-tcp}"

    _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"

    # The rpcinfo output is kept in $ctdb_check_rpc_out either way
    ctdb_check_rpc_out=$(rpcinfo -T $_family $_localhost \
                                 $_progname $_version 2>&1) && return 0

    # On failure, prefix the output with a summary line, print it and
    # leave it in $ctdb_check_rpc_out for the caller.
    ctdb_check_rpc_out="$_progname failed RPC check:
$ctdb_check_rpc_out"
    echo "$ctdb_check_rpc_out"
    return 1
}
420
# Run ctdb_check_rpc() for every combination of the given families and
# versions, stopping at (and returning the status of) the first failure.
nfs_check_rpcinfo ()
{
    # $1 - program name, passed to rpcinfo (looked up in /etc/rpc)
    # $2 - versions; optional, space separated
    # $3 - families; optional, space separated, default is "tcp"
    _progname="$1"
    _versions="$2"
    _families="${3:-tcp}"

    for _family in $_families ; do
        if [ -z "$_versions" ] ; then
            # No version given: a single check with an empty version
            ctdb_check_rpc $_progname "" $_family || return $?
        else
            for _version in $_versions ; do
                ctdb_check_rpc $_progname $_version $_family || return $?
            done
        fi
    done
}
437
438 ######################################################
439 # Ensure $service_name is set
assert_service_name ()
{
    # An empty or unset $service_name is an internal error
    if [ -z "$service_name" ] ; then
        die "INTERNAL ERROR: \$service_name not set"
    fi
}
444
445 ######################################################
446 # check a set of directories is available
447 # return 1 on a missing directory
448 # directories are read from stdin
449 ######################################################
ctdb_check_directories_probe()
{
    # One directory per line on stdin.  On failure the offending
    # directory is left in $d.
    while IFS="" read d ; do
        # Entries containing "%" are not checked
        case "$d" in
            *%*) continue ;;
        esac

        if [ ! -d "${d}/." ] ; then
            return 1
        fi
    done
}
462
463 ######################################################
464 # check a set of directories is available
465 # directories are read from stdin
466 ######################################################
ctdb_check_directories()
{
    # The probe leaves the failing directory in $d
    if ! ctdb_check_directories_probe ; then
        echo "ERROR: $service_name directory \"$d\" not available"
        exit 1
    fi
}
474
475 ######################################################
476 # check a set of tcp ports
477 # usage: ctdb_check_tcp_ports <ports...>
478 ######################################################
479
480 # This flag file is created when a service is initially started.  It
481 # is deleted the first time TCP port checks for that service succeed.
482 # Until then ctdb_check_tcp_ports() prints a more subtle "error"
483 # message if a port check fails.
_ctdb_check_tcp_common ()
{
    assert_service_name

    # Per-service flag file, kept in the failure counter directory
    _ctdb_service_started_file="${ctdb_fail_dir}/${service_name}.started"
}
489
# Create the "service started" flag file for $service_name, creating
# its directory first if necessary.
ctdb_check_tcp_init ()
{
    _ctdb_check_tcp_common

    mkdir -p "${_ctdb_service_started_file%/*}" # dirname
    touch "$_ctdb_service_started_file"
}
496
497 # Check whether something is listening on all of the given TCP ports
498 # using the "ctdb checktcpport" command.
ctdb_check_tcp_ports()
{
    # Arguments: one or more TCP port numbers.
    # Returns 0 if something is listening on all of them, 1 if one is
    # not being listened on, or the exit code of "ctdb checktcpport"
    # if that fails unexpectedly.
    if [ -z "$1" ] ; then
        echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
        exit 1
    fi

    for _p ; do  # process each function argument (port)
        _cmd="ctdb checktcpport $_p"
        _out=$($_cmd 2>&1)
        _ret=$?
        case "$_ret" in
            0)
                # The check was able to bind, so nothing is listening.
                # While the "started" flag file still exists this is
                # reported as INFO only, otherwise as an ERROR.
                _ctdb_check_tcp_common
                if [ ! -f "$_ctdb_service_started_file" ] ; then
                    echo "ERROR: $service_name tcp port $_p is not responding"
                    debug "\"ctdb checktcpport $_p\" was able to bind to port"
                else
                    echo "INFO: $service_name tcp port $_p is not responding"
                fi

                return 1
                ;;
            98)
                # Couldn't bind, something already listening, next port...
                continue
                ;;
            *)
                echo "ERROR: unexpected error running \"ctdb checktcpport\""
                # The here-document holds the command output verbatim
                debug <<EOF
ctdb checktcpport (exited with $_ret) with output:
$_out
EOF
                return $_ret
                ;;
        esac
    done

    # All ports listening
    _ctdb_check_tcp_common
    rm -f "$_ctdb_service_started_file"
    return 0
}
541
542 ######################################################
543 # check a unix socket
# usage: ctdb_check_unix_socket <socket_path>
545 ######################################################
ctdb_check_unix_socket() {
    # $1 - socket path; an empty path means there is nothing to check
    socket_path="$1"
    if [ -z "$socket_path" ] ; then
        return
    fi

    # Look for a listening unix socket whose netstat line ends in the
    # given path.
    if netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$" ; then
        return 0
    fi

    echo "ERROR: $service_name socket $socket_path not found"
    return 1
}
555
556 ######################################################
557 # check a command returns zero status
558 # usage: ctdb_check_command <command>
559 ######################################################
ctdb_check_command ()
{
    # Run the command given as arguments, capturing stdout and stderr
    if _out=$("$@" 2>&1) ; then
        return 0
    fi

    # Failure: report it, pass the captured output to debug() and
    # terminate the script.
    echo "ERROR: $* returned error"
    echo "$_out" | debug
    exit 1
}
568
569 ################################################
570 # kill off any TCP connections with the given IP
571 ################################################
kill_tcp_connections ()
{
    # $1 - IP address whose connections are to be killed
    # $2 - optional "oneway": only request the src->dst direction
    _ip="$1"

    if [ "$2" = "oneway" ] ; then
        _oneway=true
    else
        _oneway=false
    fi

    get_tcp_connections_for_ip "$_ip" | {
        _killcount=0
        _connections=""
        _nl="
"
        # Build the newline separated list that is fed to
        # "ctdb killtcp"
        while read _dst _src; do
            _destport="${_dst##*:}"
            case $_destport in
                # we only do one-way killtcp for CIFS
                139|445) __oneway=true ;;
                *)       __oneway=$_oneway ;;
            esac

            echo "Killing TCP connection $_src $_dst"
            _connections="${_connections}${_nl}${_src} ${_dst}"
            if ! $__oneway ; then
                _connections="${_connections}${_nl}${_dst} ${_src}"
            fi

            _killcount=$((_killcount + 1))
        done

        # Nothing to do
        [ $_killcount -ne 0 ] || return 0

        if ! echo "$_connections" | ctdb killtcp ; then
            echo "Failed to send killtcp control"
            return
        fi

        # Re-check once a second until the connections are gone,
        # giving up after the 4th unsuccessful check
        _count=0
        while : ; do
            _remaining=$(get_tcp_connections_for_ip $_ip | wc -l)

            if [ $_remaining -eq 0 ] ; then
                echo "Killed $_killcount TCP connections to released IP $_ip"
                return
            fi

            _count=$((_count + 1))
            if [ $_count -gt 3 ] ; then
                echo "Timed out killing tcp connections for IP $_ip ($_remaining remaining)"
                return
            fi

            echo "Waiting for $_remaining connections to be killed for IP $_ip"
            sleep 1
        done
    }
}
632
633 ##################################################################
634 # kill off the local end for any TCP connections with the given IP
635 ##################################################################
kill_tcp_connections_local_only ()
{
    # Same as kill_tcp_connections() with the "oneway" option forced
    kill_tcp_connections "${1}" oneway
}
640
641 ##################################################################
642 # tickle any TCP connections with the given IP
643 ##################################################################
tickle_tcp_connections ()
{
    _ip="$1"

    get_tcp_connections_for_ip "$_ip" |
    {
        _failed=false

        # Each connection is tickled in both directions; failures are
        # remembered and reported once at the end.
        while read dest src; do
            echo "Tickle TCP connection $src $dest"
            if ! ctdb tickle $src $dest >/dev/null 2>&1 ; then
                _failed=true
            fi

            echo "Tickle TCP connection $dest $src"
            if ! ctdb tickle $dest $src >/dev/null 2>&1 ; then
                _failed=true
            fi
        done

        ! $_failed || echo "Failed to send tickle control"
    }
}
664
# Print "local-address remote-address" (netstat columns 4 and 5) for
# each ESTABLISHED TCP connection whose local address is the given IP,
# either directly or in IPv4-mapped "::ffff:" form.
get_tcp_connections_for_ip ()
{
    _ip="$1"

    netstat -tn | awk -v ip=$_ip '
        index($1, "tcp") == 1 &&
        (index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1) &&
        $6 == "ESTABLISHED" { print $4" "$5 }'
}
675
676 ##################################################################
677 # use statd-callout to update NFS lock info
678 ##################################################################
nfs_update_lock_info ()
{
    # Nothing to do unless the statd-callout helper is installed
    [ ! -x "$CTDB_BASE/statd-callout" ] || "$CTDB_BASE/statd-callout" update
}
685
686 ########################################################
687 # start/stop the Ganesha nfs service
688 ########################################################
startstop_ganesha()
{
    # $1 - start, stop or restart; anything else is ignored
    _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
    case "$1" in
        start|stop)
            service "$_service_name" "$1"
            ;;
        restart)
            # Stop, dump rpc.statd stack traces, then start again
            service "$_service_name" stop
            nfs_dump_some_threads "rpc.statd"
            service "$_service_name" start
            ;;
    esac
}
706
707 # Dump up to the configured number of nfsd thread backtraces.
nfs_dump_some_threads ()
{
    # $1 - program name, defaults to "nfsd"
    _prog="${1:-nfsd}"

    # CTDB_NFS_DUMP_STUCK_THREADS limits the number of traces
    # (default 5); a value that is not greater than 0 disables the dump.
    _num="${CTDB_NFS_DUMP_STUCK_THREADS:-5}"
    if [ $_num -gt 0 ] ; then
        program_stack_traces "$_prog" $_num
    fi
}
717
718 ########################################################
719
# Add an address to an interface.
# $1 - interface, $2 - IP address, $3 - maskbits
# Returns 1 (after printing a message) if the address could not be
# added; dies if the interface can not be brought up.
add_ip_to_iface ()
{
    _iface=$1
    _ip=$2
    _maskbits=$3

    # Ensure interface is up
    ip link set "$_iface" up || \
        die "Failed to bringup interface $_iface"

    # Only need to define broadcast for IPv4.  The address family is
    # decided by looking for a colon in the address being added.
    case "$_ip" in
        *:*) _bcast=""      ;;
        *)   _bcast="brd +" ;;
    esac

    # $_bcast is deliberately unquoted: it is either empty or 2 words
    ip addr add "$_ip/$_maskbits" $_bcast dev "$_iface" || {
        echo "Failed to add $_ip/$_maskbits on dev $_iface"
        return 1
    }

    # Wait 5 seconds for IPv6 addresses to stop being tentative...
    if [ -z "$_bcast" ] ; then
        for _x in $(seq 1 10) ; do
            ip addr show to "${_ip}/128" | grep -q "tentative" || break
            sleep 0.5
        done

        # If the address was a duplicate then it won't be on the
        # interface so flag an error.
        _t=$(ip addr show to "${_ip}/128")
        case "$_t" in
            "")
                echo "Failed to add $_ip/$_maskbits on dev $_iface"
                return 1
                ;;
            *tentative*|*dadfailed*)
                # Still unusable: remove it again before failing
                echo "Failed to add $_ip/$_maskbits on dev $_iface"
                ip addr del "$_ip/$_maskbits" dev "$_iface"
                return 1
                ;;
        esac
    fi
}
764
delete_ip_from_iface()
{
    # $1 - interface, $2 - IP address, $3 - maskbits
    _iface=$1
    _ip=$2
    _maskbits=$3

    # promote_secondaries could be set globally for all interfaces but
    # it is probably better to avoid surprises, so limit it to the
    # interfaces where CTDB has public IP addresses.  There isn't
    # anywhere else convenient to do this so just set it each time.
    # This is much cheaper than remembering and re-adding secondaries.
    set_proc "sys/net/ipv4/conf/${_iface}/promote_secondaries" 1

    if ! ip addr del "$_ip/$_maskbits" dev "$_iface" ; then
        echo "Failed to del $_ip on dev $_iface"
        return 1
    fi
}
783
784 # If the given IP is hosted then print 2 items: maskbits and iface
ip_maskbits_iface ()
{
    _addr="$1"

    # A colon in the address means IPv6
    case "$_addr" in
        *:*)
            _family="inet6"
            _bits=128
            ;;
        *)
            _family="inet"
            _bits=32
            ;;
    esac

    # The interface name comes from the 2nd field of the first output
    # line, with a trailing ":" and any "@..." suffix removed.  For
    # each line whose 1st field matches "inet", the part of the 2nd
    # field after the "/" is printed with the interface and family.
    ip addr show to "${_addr}/${_bits}" 2>/dev/null |
        awk -v family="${_family}" '
            NR == 1 {
                iface = $2
                sub(":$", "", iface)
                sub("@.*", "", iface)
            }
            $1 ~ /inet/ {
                mask = $2
                sub(".*/", "", mask)
                print mask, iface, family
            }'
}
801
# Remove the given address from whichever interface currently hosts
# it; does nothing if the address is not hosted.
drop_ip ()
{
    _addr="${1%/*}"  # Remove optional maskbits

    # Positional parameters become: maskbits, interface, family
    set -- $(ip_maskbits_iface $_addr)
    [ -n "$1" ] || return 0

    _maskbits="$1"
    _iface="$2"
    echo "Removing public address $_addr/$_maskbits from device $_iface"
    delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
}
814
drop_all_public_ips ()
{
    # The first field of each line in $CTDB_PUBLIC_ADDRESSES is the
    # address; the rest of the line is ignored.  With the variable
    # unset or empty, /dev/null is read and nothing is dropped.
    while read _ip _x
    do
        drop_ip "$_ip"
    done < "${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
}
821
flush_route_cache ()
{
    # Write 1 to the IPv4 and then the IPv6 route flush entry, each
    # only if it is writable.
    for _ipver in ipv4 ipv6 ; do
        set_proc_maybe "sys/net/${_ipver}/route/flush" 1
    done
}
827
828 ########################################################
829 # Simple counters
_ctdb_counter_common ()
{
    # Counter name: $1, else $service_name, else $script_name
    _service_name="${1:-${service_name:-${script_name}}}"

    # One file per counter under $ctdb_fail_dir; make sure the
    # containing directory exists.
    _counter_file="${ctdb_fail_dir}/${_service_name}"
    mkdir -p "${_counter_file%/*}" # dirname
}
ctdb_counter_init ()
{
    _ctdb_counter_common "$1"

    # Reset the counter by truncating (or creating) its file
    : >"$_counter_file"
}
ctdb_counter_incr ()
{
    _ctdb_counter_common "$1"

    # unary counting!  The count is the size of the file, so exactly
    # one byte must be appended.  printf is used because the handling
    # of "echo -n" varies between shells.
    printf '1' >> "$_counter_file"
}
ctdb_counter_get ()
{
    _ctdb_counter_common "$1"

    # unary counting!  The value is the size of the counter file in
    # bytes; 0 is printed if the file can not be examined.
    if ! stat -c "%s" "$_counter_file" 2>/dev/null ; then
        echo 0
    fi
}
# Compare a failure counter against a limit.
# $1 - "error" (default): print a message and exit 1 when the check
#      hits; anything else: silently return 1 instead
# $2 - integer operator supported by test (default -ge), or "%" to hit
#      when the count is a multiple of the limit
# $3 - limit, defaults to $service_fail_limit
# $4 - optional counter name, as for the other counter functions
ctdb_check_counter () {
    _msg="${1:-error}"  # "error"  - anything else is silent on fail
    _op="${2:--ge}"  # an integer operator supported by test
    _limit="${3:-${service_fail_limit}}"

    # All 3 leading arguments have defaults, so fewer than 3 may have
    # been given.  "shift 3" is an error in that case, so just clear
    # the arguments instead.
    if [ $# -ge 3 ] ; then
        shift 3
    else
        set --
    fi

    # Resolve the counter name in this shell.  ctdb_counter_get runs
    # in a command substitution below, so the $_service_name it sets
    # would not be visible to the error message.
    _ctdb_counter_common "$1"
    _size=$(ctdb_counter_get "$1")

    _hit=false
    if [ "$_op" != "%" ] ; then
        if [ "$_size" "$_op" "$_limit" ] ; then
            _hit=true
        fi
    else
        if [ $(($_size $_op $_limit)) -eq 0 ] ; then
            _hit=true
        fi
    fi
    if $_hit ; then
        if [ "$_msg" = "error" ] ; then
            echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
            exit 1
        else
            return 1
        fi
    fi
}
878
879 ########################################################
880
# Locations under $CTDB_VARDIR/state for status files and failure
# counters respectively.
ctdb_status_dir="${CTDB_VARDIR}/state/service_status"
ctdb_fail_dir="${CTDB_VARDIR}/state/failcount"
883
ctdb_setup_service_state_dir ()
{
    # $1 - optional service name, defaults to $service_name.
    # Sets $service_state_dir and makes sure the directory exists.
    service_state_dir="${CTDB_VARDIR}/state/service_state/${1:-${service_name}}"

    if ! mkdir -p "$service_state_dir" ; then
        echo "Error creating state dir \"$service_state_dir\""
        exit 1
    fi
}
892
893 ########################################################
894 # Managed status history, for auto-start/stop
895
# Directory holding one flag file per service that has been managed
ctdb_managed_dir="${CTDB_VARDIR}/state/managed_history"
897
_ctdb_managed_common ()
{
    # Flag file for the current $service_name
    _ctdb_managed_file="${ctdb_managed_dir}/${service_name}"
}
902
# Record that $service_name is managed by creating its flag file.
ctdb_service_managed ()
{
    _ctdb_managed_common

    mkdir -p "$ctdb_managed_dir"
    touch "$_ctdb_managed_file"
}
909
# Record that $service_name is not managed by removing its flag file.
ctdb_service_unmanaged ()
{
    _ctdb_managed_common

    rm -f "$_ctdb_managed_file"
}
915
# Succeeds if the flag file for $service_name exists.
is_ctdb_previously_managed_service ()
{
    _ctdb_managed_common

    test -f "$_ctdb_managed_file"
}
921
922 ########################################################
923 # Check and set status
924
# Print a one-line report of a recorded status.
# $1 - status name, $2 - file containing the reported problem
log_status_cat ()
{
    # The file name is quoted so that a path containing whitespace or
    # glob characters is read as a single file.
    echo "node is \"$1\", \"${script_name}\" reports problem: $(cat "$2")"
}
929
# Report any status recorded for $script_name.
# Returns 1 for "unhealthy", 2 for "banned", 0 if neither is recorded;
# "unhealthy" takes precedence.
ctdb_checkstatus ()
{
    if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
        log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
        return 1
    fi

    if [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
        log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
        return 2
    fi

    return 0
}
942
# Record or clear the status for $script_name.
# $1 - "unhealthy" or "banned" to record; anything else clears both
# $2 - file whose contents are stored as the status
ctdb_setstatus ()
{
    d="$ctdb_status_dir/$script_name"

    if [ "$1" = "unhealthy" ] || [ "$1" = "banned" ] ; then
        mkdir -p "$d"
        cat "$2" >"$d/$1"
    else
        for i in "banned" "unhealthy" ; do
            rm -f "$d/$i"
        done
    fi
}
958
959 ##################################################################
960 # Reconfigure a service on demand
961
_ctdb_service_reconfigure_common ()
{
    # Per-service status directory; created if missing
    _d="${ctdb_status_dir}/${service_name}"
    mkdir -p "$_d"

    # Flag file marking a pending reconfigure
    _ctdb_service_reconfigure_flag="${_d}/reconfigure"
}
968
# Succeeds if the reconfigure flag for $service_name exists.
ctdb_service_needs_reconfigure ()
{
    _ctdb_service_reconfigure_common

    test -e "$_ctdb_service_reconfigure_flag"
}
974
# Create (or truncate) the reconfigure flag for $service_name.
ctdb_service_set_reconfigure ()
{
    _ctdb_service_reconfigure_common

    : >"$_ctdb_service_reconfigure_flag"
}
980
# Remove the reconfigure flag for $service_name, if present.
ctdb_service_unset_reconfigure ()
{
    _ctdb_service_reconfigure_common

    rm -f "$_ctdb_service_reconfigure_flag"
}
986
ctdb_service_reconfigure ()
{
    echo "Reconfiguring service \"${service_name}\"..."

    # The pending flag is cleared before the reconfigure is attempted
    ctdb_service_unset_reconfigure

    # The failure counter is only reset after a successful
    # reconfigure; otherwise the failure status is passed back.
    if service_reconfigure ; then
        ctdb_counter_init
    else
        return $?
    fi
}
994
995 # Default service_reconfigure() function does nothing.
service_reconfigure ()
{
    # Nothing to do by default; always succeeds
    return 0
}
1000
ctdb_reconfigure_take_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"
    mkdir -p "${_lock%/*}" # dirname
    touch "$_lock"

    (
        # stdin of this subshell is the lock file itself; flock is
        # applied to that descriptor while the file is inspected and
        # rewritten.
        flock 0

        # First line: event name of the recorded holder (if any).
        # Remaining lines: PIDs.  This is overkill but will work if we
        # need to extend this to allow certain events to run multiple
        # times in parallel (e.g. takeip) and write multiple PIDs to
        # the file.
        read _locker_event
        if [ -n "$_locker_event" ] ; then
            while read _pid ; do
                [ -n "$_pid" ] || continue
                [ "$_pid" != $$ ] || continue

                # A recorded PID other than our own that can still be
                # signalled means the lock is held: fail.
                if kill -0 "$_pid" 2>/dev/null ; then
                    exit 1
                fi
            done
        fi

        # Lock is free (or stale, or ours): record event and PID
        printf "%s\n%s\n" "$event_name" $$ >"$_lock"
        exit 0
    ) <"$_lock"
}
1027
# Release the reconfigure lock for $service_name by deleting its lock
# file.
ctdb_reconfigure_release_lock ()
{
    _ctdb_service_reconfigure_common

    _lock="${_d}/reconfigure_lock"
    rm -f "$_lock"
}
1035
ctdb_replay_monitor_status ()
{
    echo "Replaying previous status for this script due to reconfigure..."

    # Pick this script's "monitor" line out of the machine readable
    # script status.  Leading separator ('|') is missing in some
    # versions...
    _out=$(ctdb scriptstatus -X | grep -E "^\|?monitor\|${script_name}\|")

    # Output looks like this:
    # |monitor|60.nfs|1|ERROR|1314764004.030861|1314764004.035514|foo bar|
    # Splitting on '|' and then on whitespace is the cheapest way of
    # getting fields in the middle: the code is then the 3rd word and
    # the status the 4th, with or without the leading separator.
    set -- $(IFS="|" ; echo $_out)
    _code="$3"
    _status="$4"

    # The error output field can include colons so we'll try to
    # preserve them.  The weak checking at the beginning tries to make
    # this work for both broken (no leading '|') and fixed output.
    _out="${_out%|}"
    _err_out="${_out#*monitor|${script_name}|*|*|*|*|}"

    # TIMEDOUT and DISABLED can't be replayed with the correct
    # negative exit code, so they are recast as an error and as
    # success respectively, with a note prepended to the output.
    case "$_status" in
        TIMEDOUT)
            _code=1
            _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        DISABLED)
            _code=0
            _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        *)
            # OK or ERROR: replayed as is
            :
            ;;
    esac

    [ -z "$_err_out" ] || echo "$_err_out"
    exit $_code
}
1073
# Coordinate reconfigures with the "monitor", "ipreallocated" and
# "reconfigure" events for the current service.  Any other event
# returns 0 immediately.
#
# If the reconfigure lock can be taken:
#   reconfigure   - run the reconfigure, release the lock and exit
#                   with the reconfigure's status;
#   ipreallocated - run a reconfigure if one is flagged as needed,
#                   release the lock and return;
#   monitor       - release the lock and return.
# If the lock is held by somebody else:
#   reconfigure   - exit 2 so the caller can retry;
#   ipreallocated - return 0, deferring any scheduled reconfigure;
#   monitor       - replay the previous monitor status and exit.
ctdb_service_check_reconfigure ()
{
    assert_service_name

    # We only care about some events in this function.  For others we
    # return now.
    case "$event_name" in
        monitor|ipreallocated|reconfigure) : ;;
        *) return 0 ;;
    esac

    if ctdb_reconfigure_take_lock ; then
        # No events covered by this function are running, so it is
        # safe to proceed.
        case "$event_name" in
            reconfigure)
                # Run in a subshell so that an "exit" inside the
                # reconfigure code cannot skip releasing the lock.
                # Previously the lock file was left behind here and
                # only the PID liveness check in
                # ctdb_reconfigure_take_lock stopped it from blocking
                # later events - which fails if the PID is recycled.
                (ctdb_service_reconfigure)
                _rc=$?
                ctdb_reconfigure_release_lock
                exit $_rc
                ;;
            ipreallocated)
                if ctdb_service_needs_reconfigure ; then
                    ctdb_service_reconfigure
                fi
                ;;
        esac

        ctdb_reconfigure_release_lock
    else
        # Somebody else is running an event we don't want to collide
        # with.  We proceed with caution.
        case "$event_name" in
            reconfigure)
                # Tell whoever called us to retry.
                exit 2
                ;;
            ipreallocated)
                # Defer any scheduled reconfigure and just run the
                # rest of the ipreallocated event, as per the
                # eventscript.  There's an assumption here that the
                # event doesn't depend on any scheduled reconfigure.
                return 0
                ;;
            monitor)
                # There is most likely a reconfigure in progress so
                # the service is possibly unstable.  As above, we
                # defer any scheduled reconfigure.  We also replay
                # the previous monitor status since that's the best
                # information we have.
                ctdb_replay_monitor_status
                ;;
        esac
    fi
}
1128
1129 ##################################################################
1130 # Does CTDB manage this service? - and associated auto-start/stop
1131
# Backward-compatibility helper for the old CTDB_MANAGES_* settings.
#
# $1 - value of a legacy CTDB_MANAGES_* variable
# $2 - service name that variable controls
#
# If $1 is "yes" and $2 is the current $service_name then $2 is
# appended to $CTDB_MANAGED_SERVICES.  Always returns 0.
ctdb_compat_managed_service ()
{
    [ "$1" = "yes" ] || return 0
    [ "$2" = "$service_name" ] || return 0

    CTDB_MANAGED_SERVICES="${CTDB_MANAGED_SERVICES} $2"
}
1138
# Succeed (return 0) if CTDB manages the current service, i.e. if
# $service_name appears as a whole word in $CTDB_MANAGED_SERVICES,
# either directly or after folding in the legacy CTDB_MANAGES_*
# settings.  Returns 1 otherwise.
is_ctdb_managed_service ()
{
    assert_service_name

    # Pad the list with spaces so that only whole, space-delimited
    # words can match.
    t=" $CTDB_MANAGED_SERVICES "
    case "$t" in
        *" "${service_name}" "*) return 0 ;;
    esac

    # No match: update $CTDB_MANAGED_SERVICES from the legacy
    # variables for backward compatibility and try again.
    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"  "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"   "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND" "winbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"   "apache2"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"   "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"   "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"   "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"     "nfs"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"     "nfs-ganesha-gpfs"

    t=" $CTDB_MANAGED_SERVICES "
    case "$t" in
        *" "${service_name}" "*) return 0 ;;
        *) return 1 ;;
    esac
}
1169
# Start or stop the current service as appropriate for $event_name.
#
# service-start / service-stop pseudo-events: start/stop the service
# by hand and exit with the resulting status.  These are refused (via
# die) when the service is managed by CTDB or when
# $CTDB_SERVICE_AUTOSTARTSTOP is "yes".
#
# monitor event with $CTDB_SERVICE_AUTOSTARTSTOP = "yes": start the
# service in the background if it has just become managed, stop it in
# the background if it has just stopped being managed, and exit with
# the status of launching that.  In every other case return 0.
ctdb_start_stop_service ()
{
    assert_service_name

    # Allow service-start/service-stop pseudo-events to start/stop
    # services when we're not auto-starting/stopping and we're not
    # monitoring.
    case "$event_name" in
        service-start|service-stop)
            if is_ctdb_managed_service ; then
                die "$event_name event not permitted when service is managed"
            fi
            if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
                die "$event_name event not permitted with \$CTDB_SERVICE_AUTOSTARTSTOP = yes"
            fi
            # Runs ctdb_service_start or ctdb_service_stop.
            "ctdb_service_${event_name#service-}"
            exit $?
            ;;
    esac

    # Do nothing unless configured to...
    if [ "$CTDB_SERVICE_AUTOSTARTSTOP" != "yes" ] ; then
        return 0
    fi

    # ... and only ever during monitoring.
    if [ "$event_name" != "monitor" ] ; then
        return 0
    fi

    if is_ctdb_managed_service ; then
        if is_ctdb_previously_managed_service ; then
            # Already managed: nothing has changed.
            return 0
        fi
        echo "Starting service \"$service_name\" - now managed"
        background_with_logging ctdb_service_start
        exit $?
    fi

    if is_ctdb_previously_managed_service ; then
        echo "Stopping service \"$service_name\" - no longer managed"
        background_with_logging ctdb_service_stop
        exit $?
    fi
}
1219
# Start the current service.
#
# Runs ctdb_service_managed first, so the service is marked managed if
# we've ever tried to start it, then the service_start() hook.  If the
# hook fails its status is returned; otherwise ctdb_counter_init and
# ctdb_check_tcp_init are run and the status of the latter is returned.
ctdb_service_start ()
{
    ctdb_service_managed

    if service_start ; then
        ctdb_counter_init
        ctdb_check_tcp_init
    else
        # $? still holds the status of service_start here.
        return $?
    fi
}
1230
# Stop the current service: run ctdb_service_unmanaged and then the
# service_stop() hook, returning the hook's status.
ctdb_service_stop ()
{
    ctdb_service_unmanaged

    service_stop
}
1236
# Default service_start() hook: start $service_name via the "service"
# command and return its status.  Like the default service_stop(),
# this may be overridden in an eventscript.
service_start ()
{
    service "${service_name}" start
}
1244
# Default service_stop() hook: stop $service_name via the "service"
# command and return its status.
service_stop ()
{
    service "${service_name}" stop
}
1249
1250 ##################################################################
1251
# Handle the events that are treated the same way for every script.
#
# $1 - event name; any further arguments belong to the event
#
#   status    - run ctdb_checkstatus and exit with its status
#   setstatus - run ctdb_setstatus with the remaining arguments and
#               exit with its status
# For any other event this does nothing and returns 0.
ctdb_standard_event_handler ()
{
    if [ "$1" = "status" ] ; then
        ctdb_checkstatus
        exit $?
    elif [ "$1" = "setstatus" ] ; then
        shift
        ctdb_setstatus "$@"
        exit $?
    fi
}
1266
# Run iptables or ip6tables under a lock.
#
# $1   - address family: "inet6" selects ip6tables, anything else
#        selects iptables
# $2.. - arguments passed through to the selected command
#
# Returns the exit status of flock.
iptables_wrapper ()
{
    _family="$1"
    shift

    case "$_family" in
        inet6) _iptables_cmd="ip6tables" ;;
        *)     _iptables_cmd="iptables"  ;;
    esac

    # iptables doesn't like being re-entered, so run it under flock,
    # waiting up to 30 seconds for the lock.
    flock -w 30 "${CTDB_VARDIR}/iptables-ctdb.flock" "$_iptables_cmd" "$@"
}
1279
# AIX (and perhaps others?) doesn't have mktemp, so provide a minimal
# replacement when no mktemp command is available.
#
# Usage: mktemp [-d]
#
# Creates a file (or, with -d, a directory) named tmp.<10 hex digits>
# under ${TMPDIR:-/tmp} using umask 077 and prints its path.  Any
# template argument is ignored.  Returns non-zero, printing nothing on
# stdout, if the file or directory could not be created.
if ! type mktemp >/dev/null 2>&1 ; then
    mktemp ()
    {
        _mktemp_dir=false
        if [ "$1" = "-d" ] ; then
            _mktemp_dir=true
            shift
        fi
        # A function-specific variable name is used for the base
        # directory so that $_d, which other functions in this file
        # read, is not clobbered.
        _mktemp_base="${TMPDIR:-/tmp}"
        _hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
            md5sum | \
            sed -e 's@\(..........\).*@\1@')
        _t="${_mktemp_base}/tmp.${_hex10}"
        (
            umask 077
            if $_mktemp_dir ; then
                mkdir "$_t"
            else
                # noclobber: fail instead of truncating a file that
                # already exists under this name.
                set -C
                : >"$_t"
            fi
        ) || return 1
        echo "$_t"
    }
fi
1305
1306 ########################################################
1307 # tickle handling
1308 ########################################################
1309
# Synchronise CTDB's tickle list for one port with the TCP connections
# currently established to the public IPs held by this node.
#
# $1 - local TCP port to examine
#
# Uses $pnn after calling ctdb_get_pnn (presumably what sets it -
# confirm).  Working files are kept under $CTDB_VARDIR/state/tickles
# and removed again at the end.
update_tickles ()
{
    _port="$1"

    tickledir="$CTDB_VARDIR/state/tickles"
    mkdir -p "$tickledir"

    ctdb_get_pnn

    # What public IPs do I hold?
    _ips=$(ctdb -X ip | awk -F'|' -v pnn=$pnn '$3 == pnn {print $2}')

    # The same IPs as a regexp alternation, with the dots escaped.
    _ipschoice=$(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g')
    _ipschoice="(${_ipschoice})"

    # Record the established connections to our public IPs on this
    # port, one "<source> <destination>" pair per line, sorted so that
    # comm can be used below.
    _my_connections="${tickledir}/${_port}.connections"
    rm -f "$_my_connections"
    netstat -tn |
    awk -v destpat="^${_ipschoice}:${_port}\$" \
        '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
    sort >"$_my_connections"

    # Record the tickles CTDB currently has for those IPs, in the same
    # format.  The first line of gettickles output is skipped.
    _my_tickles="${tickledir}/${_port}.tickles"
    rm -f "$_my_tickles"
    for _ip in $_ips ; do
        ctdb -X gettickles $_ip $_port |
        awk -F'|' 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
    done |
    sort >"$_my_tickles"

    # Add tickles for connections that we haven't already got tickles for
    comm -23 "$_my_connections" "$_my_tickles" |
    while read _src _dst ; do
        ctdb addtickle $_src $_dst
    done

    # Remove tickles for connections that are no longer there
    comm -13 "$_my_connections" "$_my_tickles" |
    while read _src _dst ; do
        ctdb deltickle $_src $_dst
    done

    rm -f "$_my_connections" "$_my_tickles"
}
1356
1357 ########################################################
1358 # load a site local config file
1359 ########################################################
1360
# Source the site-local configuration, in this order:
#   1. the file named by $CTDB_RC_LOCAL, if set and executable;
#   2. $CTDB_BASE/rc.local, if executable;
#   3. every executable file in $CTDB_BASE/rc.local.d/.
# All expansions are quoted so that paths containing whitespace or
# glob characters are handled as a single word.
if [ -n "$CTDB_RC_LOCAL" ] && [ -x "$CTDB_RC_LOCAL" ] ; then
    . "$CTDB_RC_LOCAL"
fi

if [ -x "$CTDB_BASE/rc.local" ] ; then
    . "$CTDB_BASE/rc.local"
fi

if [ -d "$CTDB_BASE/rc.local.d" ] ; then
    # If the directory is empty the pattern stays unexpanded and is
    # skipped by the -x test.
    for i in "$CTDB_BASE"/rc.local.d/* ; do
        [ -x "$i" ] && . "$i"
    done
fi

# Defaults for the script that sourced this file.
script_name="${0##*/}"       # basename
service_fail_limit=1
event_name="$1"