eventscripts: Remove unnecessary variables from killtcp/tickle functions
[obnox/samba/samba-obnox.git] / ctdb / config / functions
1 # Hey Emacs, this is a -*- shell-script -*- !!!
2
3 # utility functions for ctdb event scripts
4
# Search the standard system binary directories ahead of whatever the
# caller had in PATH.
PATH=/bin:/usr/bin:/usr/sbin:/sbin:$PATH

# Default CTDB_VARDIR: use /var/lib/ctdb when that directory exists,
# otherwise /var/ctdb.  A value already set by the caller is kept.
if [ -z "$CTDB_VARDIR" ] ; then
    if [ -d "/var/lib/ctdb" ] ; then
        CTDB_VARDIR="/var/lib/ctdb"
    else
        CTDB_VARDIR="/var/ctdb"
    fi
    export CTDB_VARDIR
fi

# Default CTDB_ETCDIR: /etc.  A value already set by the caller is kept.
if [ -z "$CTDB_ETCDIR" ] ; then
    CTDB_ETCDIR="/etc"
    export CTDB_ETCDIR
fi
17
18 #######################################
19 # pull in a system config file, if any
_loadconfig() {
    # Usage: _loadconfig [NAME]
    #
    # With no NAME, load the configuration named by $service_config
    # (falling back to $service_name), if either is set.  With a NAME
    # other than "ctdb", the "ctdb" configuration is loaded first.
    #
    # The file for NAME is sourced from the first of these that exists:
    #   $CTDB_ETCDIR/sysconfig/NAME
    #   $CTDB_ETCDIR/default/NAME
    #   $CTDB_BASE/sysconfig/NAME
    # All paths are quoted so directories containing whitespace work.

    if [ -z "$1" ] ; then
        _config="${service_config:-${service_name}}"
        if [ -n "$_config" ] ; then
            loadconfig "$_config"
        fi
        # There is no file to look up for an empty name.
        return 0
    fi

    if [ "$1" != "ctdb" ] ; then
        loadconfig "ctdb"
    fi

    if [ -f "$CTDB_ETCDIR/sysconfig/$1" ]; then
        . "$CTDB_ETCDIR/sysconfig/$1"
    elif [ -f "$CTDB_ETCDIR/default/$1" ]; then
        . "$CTDB_ETCDIR/default/$1"
    elif [ -f "$CTDB_BASE/sysconfig/$1" ]; then
        . "$CTDB_BASE/sysconfig/$1"
    fi
}
39
loadconfig ()
{
    # Forward every argument, unchanged, to _loadconfig.
    _loadconfig "$@"
}
43
44 ##############################################################
45
46 # CTDB_SCRIPT_DEBUGLEVEL can be overwritten by setting it in a
47 # configuration file.
debug ()
{
    # Emit debug output only when CTDB_SCRIPT_DEBUGLEVEL (default 2)
    # is at least 4; otherwise do nothing and leave stdin untouched.
    [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] || return 0

    if [ -n "$1" ] ; then
        # Arguments given: print them on one prefixed line.
        echo "DEBUG: $*"
    elif ! tty -s ; then
        # No arguments and stdin is not a terminal: prefix every line
        # read from stdin (handy with a here document).
        sed 's/^/DEBUG: /'
    fi
}
61
die ()
{
    # Usage: die MESSAGE [EXIT_CODE]
    # Print MESSAGE on stdout and exit the current (sub)shell with
    # EXIT_CODE, which defaults to 1.
    _msg="$1" ; _rc="${2:-1}"

    echo "$_msg"
    exit $_rc
}
70
71 # Log given message or stdin to either syslog or a CTDB log file
72 # $1 is the tag passed to logger if syslog is in use.
script_log ()
{
    # Usage: script_log TAG [MESSAGE...]
    #
    # Log MESSAGE, or stdin when no message is given, either via
    # logger (tagged "ctdbd: TAG") or by appending to the CTDB log
    # file ($CTDB_LOGFILE, default /var/log/log.ctdb).
    #
    # Syslog is used when CTDB_SYSLOG is "yes", when CTDB_LOGFILE is
    # empty, or when CTDB_OPTIONS contains "--syslog".
    _tag="$1" ; shift

    _using_syslog=false
    if [ "$CTDB_SYSLOG" = "yes" ] || [ -z "$CTDB_LOGFILE" ] ; then
        _using_syslog=true
    fi
    case "$CTDB_OPTIONS" in
        *--syslog*) _using_syslog=true ;;
    esac

    if $_using_syslog ; then
        if [ -n "$*" ] ; then
            # Pass the message as a single quoted word so it is
            # neither word-split nor glob-expanded by the shell.
            logger -t "ctdbd: ${_tag}" "$*"
        else
            # No message: logger is run without one (stdin case).
            logger -t "ctdbd: ${_tag}"
        fi
    else
        {
            if [ -n "$*" ] ; then
                echo "$*"
            else
                cat
            fi
        } >>"${CTDB_LOGFILE:-/var/log/log.ctdb}"
    fi
}
97
98 # When things are run in the background in an eventscript then logging
99 # output might get lost.  This is the "solution".  :-)
background_with_logging ()
{
    # Run the given command in the background with stdin taken from
    # /dev/null, piping its stdout and stderr into script_log under
    # the tag "<script_name>&".  Always returns 0 immediately.
    {
        "$@" </dev/null 2>&1 | script_log "${script_name}&"
    } &

    return 0
}
109
110 ##############################################################
111 # check number of args for different events
ctdb_check_args ()
{
    # Validate the number of arguments (event name included) for the
    # events that take a fixed argument list.  Prints an error and
    # exits 1 on a mismatch; any other event is accepted as is.
    case "$1" in
        takeip|releaseip)
            [ $# = 4 ] && return 0
            echo "ERROR: must supply interface, IP and maskbits"
            exit 1
            ;;
        updateip)
            [ $# = 5 ] && return 0
            echo "ERROR: must supply old interface, new interface, IP and maskbits"
            exit 1
            ;;
    esac
}
129
130 ##############################################################
131 # determine on what type of system (init style) we are running
detect_init_style() {
    # Leave a value that is already set alone.
    [ -z "$CTDB_INIT_STYLE" ] || return 0

    # Start from the fallback and let the more specific probes
    # override it; /sbin/startproc takes precedence over
    # /sbin/start-stop-daemon.
    CTDB_INIT_STYLE="redhat"
    if [ -x /sbin/start-stop-daemon ] ; then
        CTDB_INIT_STYLE="debian"
    fi
    if [ -x /sbin/startproc ] ; then
        CTDB_INIT_STYLE="suse"
    fi
    return 0
}
144
145 ######################################################
146 # simulate /sbin/service on platforms that don't have it
147 # _service() makes it easier to hook the service() function for
148 # testing.
_service ()
{
  # Usage: _service SERVICE_NAME OPERATION
  # Runs OPERATION on the service, prefixed by the contents of $_nice
  # (set by the calling wrapper; may be empty).
  _service_name="$1"
  _op="$2"

  # do nothing, when no service was specified
  [ -n "$_service_name" ] || return 0

  # Prefer /sbin/service when it is available.
  if [ -x /sbin/service ]; then
      $_nice /sbin/service "$_service_name" "$_op"
      return
  fi

  # Otherwise run the first executable init script found.
  for _dir in init.d rc.d/init.d ; do
      if [ -x $CTDB_ETCDIR/$_dir/$_service_name ]; then
          $_nice $CTDB_ETCDIR/$_dir/$_service_name "$_op"
          return
      fi
  done
}
165
service()
{
    # Run the service operation with no command prefix.
    _nice=
    _service "$@"
}
171
172 ######################################################
173 # simulate /sbin/service (niced) on platforms that don't have it
nice_service()
{
    # Run the service operation prefixed with "nice".
    _nice=nice
    _service "$@"
}
179
180 ######################################################
181 # wrapper around /proc/ settings to allow them to be hooked
182 # for testing
183 # 1st arg is relative path under /proc/, 2nd arg is value to set
set_proc ()
{
    # $1: path relative to /proc/, $2: value to write there.
    echo "${2}" >"/proc/${1}"
}
188
189 ######################################################
190 # wrapper around getting file contents from /proc/ to allow
191 # this to be hooked for testing
192 # 1st arg is relative path under /proc/
get_proc ()
{
    # $1: path relative to /proc/ whose contents are printed.
    cat "/proc/${1}"
}
197
198 ######################################################
199 # Check that an RPC service is healthy -
200 # this includes allowing a certain number of failures
201 # before marking the NFS service unhealthy.
202 #
203 # usage: nfs_check_rpc_service SERVICE_NAME [ VERSION ] [ triple ...]
204 #
205 # each triple is a set of 3 arguments: an operator, a 
206 # fail count limit and an action string.
207 #
208 # For example:
209 #
210 #       nfs_check_rpc_service "lockd" \
211 #           -ge 15 "verbose restart unhealthy" \
212 #           -eq 10 "restart:bs"
213 #
214 # says that if lockd is down for 15 iterations then do
215 # a verbose restart of lockd and mark the node unhealthy.
216 # Before this, after 10 iterations of failure, the
217 # service is restarted silently in the background.
218 # Order is important: the number of failures need to be
219 # specified in reverse order because processing stops
220 # after the first condition that is true.
221 ######################################################
nfs_check_rpc_service ()
{
    # $1 is the program name.  An optional RPC version may follow;
    # the remaining arguments are (operator, limit, actions) triples.
    _prog_name="$1" ; shift

    # If the next argument does not start with "-" it is the version
    # rather than the operator of the first triple.
    _v=""
    if [ "${1#-}" = "$1" ] ; then
        _v="$1"
        shift
    fi

    # Per-program settings: the name to probe, the default version, an
    # explicit restart command (if any) and options for the rpc.* daemon.
    _rpc_prog="$_prog_name"
    _version=${_v:-1}
    _restart=""
    _opts=""
    case "$_prog_name" in
        knfsd)
            _rpc_prog=nfs
            _version=${_v:-3}
            _restart="echo 'Trying to restart NFS service'; startstop_nfs restart"
            ;;
        ganesha)
            _rpc_prog=nfs
            _version=${_v:-3}
            _restart="echo 'Trying to restart Ganesha NFS service'; startstop_ganesha restart"
            ;;
        mountd)
            _opts="${MOUNTD_PORT:+ -p $MOUNTD_PORT}"
            ;;
        rquotad)
            _opts="${RQUOTAD_PORT:+ -p $RQUOTAD_PORT}"
            ;;
        lockd)
            _rpc_prog=nlockmgr
            _version=${_v:-4}
            _restart="echo 'Trying to restart lock manager service'; startstop_nfslock restart"
            ;;
        statd)
            _rpc_prog=status
            _opts="${STATD_HOSTNAME:+ -n $STATD_HOSTNAME}"
            _opts="${_opts}${STATD_PORT:+ -p $STATD_PORT}"
            _opts="${_opts}${STATD_OUTGOING_PORT:+ -o $STATD_OUTGOING_PORT}"
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
    esac

    _service_name="nfs_${_prog_name}"

    # Healthy: reset the failure counter and finish.
    if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
        ctdb_counter_init "$_service_name"
        return 0
    fi

    ctdb_counter_incr "$_service_name"

    # Walk the triples; act on the first one whose condition holds.
    while [ -n "$3" ] ; do
        if ctdb_check_counter "quiet" "$1" "$2" "$_service_name" ; then
            # Condition not met, try the next triple.
            shift 3
            continue
        fi

        for _action in $3 ; do
            case "$_action" in
                verbose)
                    echo "$ctdb_check_rpc_out"
                    ;;
                restart|restart:*)
                    # No explicit command specified, construct rpc command.
                    if [ -z "$_restart" ] ; then
                        _p="rpc.${_prog_name}"
                        _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'; killall -q -9 $_p; $_p $_opts"
                    fi

                    # Restart flags are whatever follows "restart:";
                    # there may not have been a colon at all.
                    case "$_action" in
                        restart:*) _flags="${_action#restart:}" ;;
                        *) _flags="" ;;
                    esac

                    # q=quiet - everything to /dev/null
                    case "$_flags" in
                        *q*) _restart="{ ${_restart} ; } >/dev/null 2>&1" ;;
                    esac
                    # s=stealthy - last command to /dev/null
                    case "$_flags" in
                        *s*) _restart="${_restart} >/dev/null 2>&1" ;;
                    esac
                    # b=background - the whole thing, easy and reliable
                    case "$_flags" in
                        *b*) _restart="{ ${_restart} ; } &" ;;
                    esac

                    # Do it!
                    eval "${_restart}"
                    ;;
                unhealthy)
                    exit 1
                    ;;
                *)
                    echo "Internal error: unknown action \"$_action\"."
                    exit 1
            esac
        done

        # Only process the first action group.
        break
    done
}
332
333 ######################################################
334 # check that a rpc server is registered with portmap
335 # and responding to requests
336 # usage: ctdb_check_rpc SERVICE_NAME VERSION
337 ######################################################
ctdb_check_rpc ()
{
    # $1: RPC program name, $2: version.  Probes localhost over UDP
    # with rpcinfo; the probe output is kept in $ctdb_check_rpc_out.
    progname="$1"
    version="$2"

    if ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then
        return 0
    fi

    # Failure: prefix the captured output with an error line, print
    # it and return 1.
    ctdb_check_rpc_out="ERROR: $progname failed RPC check:
$ctdb_check_rpc_out"
    echo "$ctdb_check_rpc_out"
    return 1
}
350
351 ######################################################
352 # Ensure $service_name is set
assert_service_name ()
{
    # Abort via die() when $service_name is empty or unset.
    if [ -z "$service_name" ] ; then
        die "INTERNAL ERROR: \$service_name not set"
    fi
}
357
358 ######################################################
359 # check a set of directories is available
360 # return 1 on a missing directory
361 # directories are read from stdin
362 ######################################################
ctdb_check_directories_probe()
{
    # Read one directory per line from stdin.  Returns 1 at the first
    # line that is not an existing directory, leaving that line in $d
    # for the caller's error message.
    while IFS="" read d ; do
        # Lines containing "%" are not checked.
        # NOTE(review): presumably these are unexpanded substitution
        # variables - confirm.
        case "$d" in
            *%*) continue ;;
        esac

        [ -d "${d}/." ] || return 1
    done
}
375
376 ######################################################
377 # check a set of directories is available
378 # directories are read from stdin
379 ######################################################
ctdb_check_directories()
{
    # Check the directories listed on stdin; on the first one that is
    # missing print an error naming it ($d is set by the probe) and
    # exit 1.
    if ! ctdb_check_directories_probe ; then
        echo "ERROR: $service_name directory \"$d\" not available"
        exit 1
    fi
}
387
388 ######################################################
389 # check a set of tcp ports
390 # usage: ctdb_check_tcp_ports <ports...>
391 ######################################################
392
393 # This flag file is created when a service is initially started.  It
394 # is deleted the first time TCP port checks for that service succeed.
395 # Until then ctdb_check_tcp_ports() prints a more subtle "error"
396 # message if a port check fails.
_ctdb_check_tcp_common ()
{
    # Require $service_name, then compute the path of the service's
    # "started" flag file under $ctdb_fail_dir.
    assert_service_name
    _ctdb_service_started_file="${ctdb_fail_dir}/${service_name}.started"
}
402
ctdb_check_tcp_init ()
{
    # Create the "started" flag file for the current service, making
    # its parent directory first if necessary.
    _ctdb_check_tcp_common

    mkdir -p "${_ctdb_service_started_file%/*}"  # parent directory
    touch "$_ctdb_service_started_file"
}
409
ctdb_check_tcp_ports()
{
    # Usage: ctdb_check_tcp_ports PORT...
    #
    # Try each checker named in CTDB_TCP_PORT_CHECKERS in turn.  A
    # checker returning 0 (all ports OK) or 1 (a port is down, named
    # in $_p) is final; 127 means "checker unavailable, try the next
    # one".  Returns 127 when no checker could be used.
    [ -n "$1" ] || {
        echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
        exit 1
    }

    # Set default value for CTDB_TCP_PORT_CHECKERS if unset.
    # If any of these defaults are unsupported then this variable can
    # be overridden in /etc/sysconfig/ctdb or via a file in
    # /etc/ctdb/rc.local.d/.
    : ${CTDB_TCP_PORT_CHECKERS:=ctdb nmap netstat}

    for _c in $CTDB_TCP_PORT_CHECKERS ; do
        ctdb_check_tcp_ports_$_c "$@"
        case "$?" in
            0)
                # Success: the service is up, drop the "started" flag.
                _ctdb_check_tcp_common
                rm -f "$_ctdb_service_started_file"
                return 0
                ;;
            1)
                _ctdb_check_tcp_common
                if [ -f "$_ctdb_service_started_file" ] ; then
                    # Flag file still present: report the failure
                    # as INFO only.
                    echo "INFO: $service_name tcp port $_p is not responding"
                else
                    echo "ERROR: $service_name tcp port $_p is not responding"
                    debug <<EOF
$ctdb_check_tcp_ports_debug
EOF
                fi

                return 1
                ;;
            127)
                debug <<EOF
ctdb_check_ports - checker $_c not implemented
output from checker was:
$ctdb_check_tcp_ports_debug
EOF
                ;;
        esac
    done

    echo "INTERNAL ERROR: ctdb_check_ports - no working checkers in CTDB_TCP_PORT_CHECKERS=\"$CTDB_TCP_PORT_CHECKERS\""

    return 127
}
460
ctdb_check_tcp_ports_netstat ()
{
    # Check that every given port appears in the netstat list of
    # listening TCP sockets, bound to 0.0.0.0 or "::".
    # Returns 0 if all do, 1 at the first that does not (left in $_p)
    # and 127 if netstat could not be run.
    _cmd='netstat -l -t -n'
    _ns=$($_cmd 2>&1)
    if [ $? -eq 127 ] ; then
        # netstat probably not installed - unlikely?
        ctdb_check_tcp_ports_debug="$_ns"
        return 127
    fi

    for _p ; do  # process each function argument (port)
        # One pattern covering both wildcard addresses.
        _pat="[[:space:]](0\.0\.0\.0|::):${_p}[[:space:]]+[^[:space:]]+[[:space:]]+LISTEN"
        if ! echo "$_ns" | grep -E -q "$_pat" ; then
            # We didn't match the port, so flag an error.
            ctdb_check_tcp_ports_debug="$_cmd shows this output:
$_ns"
            return 1
        fi
    done

    return 0
}
488
ctdb_check_tcp_ports_nmap ()
{
    # Scan the given ports on 127.0.0.1 with a single nmap run and
    # check that each is reported open.  Returns 0 if all are, 1 at
    # the first that is not (left in $_p) and 127 if nmap could not
    # be run.

    # nmap wants a comma-separated list of ports
    _ports=""
    for _p ; do
        if [ -z "$_ports" ] ; then
            _ports="$_p"
        else
            _ports="${_ports},${_p}"
        fi
    done

    _cmd="nmap -n -oG - -PS 127.0.0.1 -p $_ports"

    _nmap_out=$($_cmd 2>&1)
    if [ $? -eq 127 ] ; then
        # nmap probably not installed
        ctdb_check_tcp_ports_debug="$_nmap_out"
        return 127
    fi

    # get the port-related output
    _port_info=$(echo "$_nmap_out" | sed -n -r -e 's@^.*Ports:[[:space:]]@@p')

    for _p ; do
        # looking for something like this:
        #  445/open/tcp//microsoft-ds///
        # possibly followed by a comma
        _t="$_p/open/tcp//"
        case "$_port_info" in
            # The info we're after must be either at the beginning of
            # the string or it must follow a space.
            $_t*|*\ $_t*) continue ;;
        esac

        # Nope, flag an error...
        ctdb_check_tcp_ports_debug="$_cmd shows this output:
$_nmap_out"
        return 1
    done

    return 0
}
528
529 # Use the new "ctdb checktcpport" command to check the port.
530 # This is very cheap.
ctdb_check_tcp_ports_ctdb ()
{
    # Check each given port with "ctdb checktcpport".  Returns 0 if
    # every port is in use, 1 at the first one that could be bound
    # (left in $_p) and 127 on any other exit status from the command.
    for _p ; do  # process each function argument (port)
        _cmd="ctdb checktcpport $_p"
        _out=$($_cmd 2>&1)
        _ret=$?

        # 98: couldn't bind, something already listening, next port...
        [ $_ret -eq 98 ] && continue

        if [ $_ret -eq 0 ] ; then
            ctdb_check_tcp_ports_debug="\"$_cmd\" was able to bind to port"
            return 1
        fi

        ctdb_check_tcp_ports_debug="$_cmd (exited with $_ret) with output:
$_out"
        # assume not implemented
        return 127
    done

    return 0
}
556
557 ######################################################
558 # check a unix socket
559 # usage: ctdb_check_unix_socket <socket_path>
560 ######################################################
ctdb_check_unix_socket() {
    # $1: path of the unix socket.  Returns 0 when no path is given
    # or when netstat lists a listening unix socket whose line ends
    # with that path; otherwise prints an error and returns 1.
    socket_path="$1"
    [ -n "$socket_path" ] || return 0

    netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$" && return 0

    echo "ERROR: $service_name socket $socket_path not found"
    return 1
}
570
571 ######################################################
572 # check a command returns zero status
573 # usage: ctdb_check_command <command>
574 ######################################################
ctdb_check_command ()
{
    # Run the given command, capturing stdout and stderr.  On success
    # return 0 silently; on failure print an error, pass the captured
    # output to debug() and exit 1.
    if _out=$("$@" 2>&1) ; then
        return 0
    fi

    echo "ERROR: $* returned error"
    echo "$_out" | debug
    exit 1
}
583
584 ################################################
585 # kill off any TCP connections with the given IP
586 ################################################
kill_tcp_connections() {
    # Usage: kill_tcp_connections IP
    #
    # Find the established TCP connections that netstat reports for
    # IP (plain or ::ffff:-mapped) and ask ctdb to kill each one.
    # Connections whose first-listed port is 139 or 445 (CIFS) are
    # killed in one direction only, all others in both.  Afterwards
    # wait, for a few one-second retries, until netstat no longer
    # shows an established connection for IP.
    _IP="$1"
    _failed=0
    _killcount=0

    # The connection list goes through a file so that the loop below
    # runs in this shell and can update the counters.
    connfile="$CTDB_VARDIR/state/connections.$_IP"
    mkdir -p "${connfile%/*}" # dirname
    netstat -tn | grep -E "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >"$connfile"
    netstat -tn | grep -E "^tcp.*[[:space:]]+::ffff:$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >>"$connfile"

    while read dest src; do
        echo "Killing TCP connection $src $dest"
        ctdb killtcp $src $dest >/dev/null 2>&1 || _failed=1
        case "${dest##*:}" in
            # we only do one-way killtcp for CIFS
            139|445) : ;;
            # for all others we do 2-way
            *)
                ctdb killtcp $dest $src >/dev/null 2>&1 || _failed=1
                ;;
        esac
        _killcount=$((_killcount + 1))
    done <"$connfile"
    rm -f "$connfile"

    [ $_failed = 0 ] || {
        echo "Failed to send killtcp control"
        return;
    }
    # Nothing was killed, so there is nothing to wait for.
    [ $_killcount -gt 0 ] || {
        return;
    }

    _count=0
    while netstat -tn | grep -E "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" >/dev/null; do
        sleep 1
        _count=$((_count + 1))
        [ $_count -gt 3 ] && {
            echo "Timed out killing tcp connections for IP $_IP"
            return;
        }
    done
    echo "killed $_killcount TCP connections to released IP $_IP"
}
631
632 ##################################################################
633 # kill off the local end for any TCP connections with the given IP
634 ##################################################################
kill_tcp_connections_local_only() {
    # Usage: kill_tcp_connections_local_only IP
    #
    # Find the established TCP connections that netstat reports for
    # IP (plain or ::ffff:-mapped) and ask ctdb to kill each one, in
    # one direction only.  Afterwards wait, for a few one-second
    # retries, until netstat no longer shows an established
    # connection for IP.
    _IP="$1"
    _failed=0
    _killcount=0

    # The connection list goes through a file so that the loop below
    # runs in this shell and can update the counters.
    connfile="$CTDB_VARDIR/state/connections.$_IP"
    mkdir -p "${connfile%/*}" # dirname
    netstat -tn | grep -E "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >"$connfile"
    netstat -tn | grep -E "^tcp.*[[:space:]]+::ffff:$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >>"$connfile"

    while read dest src; do
        echo "Killing TCP connection $src $dest"
        ctdb killtcp $src $dest >/dev/null 2>&1 || _failed=1
        _killcount=$((_killcount + 1))
    done <"$connfile"
    rm -f "$connfile"

    [ $_failed = 0 ] || {
        echo "Failed to send killtcp control"
        return;
    }
    # Nothing was killed, so there is nothing to wait for.
    [ $_killcount -gt 0 ] || {
        return;
    }

    _count=0
    while netstat -tn | grep -E "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" >/dev/null; do
        sleep 1
        _count=$((_count + 1))
        [ $_count -gt 3 ] && {
            echo "Timed out killing tcp connections for IP $_IP"
            return;
        }
    done
    echo "killed $_killcount TCP connections to released IP $_IP"
}
670
671 ##################################################################
672 # tickle any TCP connections with the given IP
673 ##################################################################
tickle_tcp_connections() {
    # Usage: tickle_tcp_connections IP
    #
    # Find the established TCP connections that netstat reports for
    # IP (plain or ::ffff:-mapped) and send a ctdb tickle for each
    # one, in both directions.
    _IP="$1"
    _failed=0

    # The connection list goes through a file so that the loop below
    # runs in this shell and can record failures.
    connfile="$CTDB_VARDIR/state/connections.$_IP"
    mkdir -p "${connfile%/*}" # dirname
    netstat -tn | grep -E "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >"$connfile"
    netstat -tn | grep -E "^tcp.*[[:space:]]+::ffff:$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >>"$connfile"

    while read dest src; do
        echo "Tickle TCP connection $src $dest"
        ctdb tickle $src $dest >/dev/null 2>&1 || _failed=1
        echo "Tickle TCP connection $dest $src"
        ctdb tickle $dest $src >/dev/null 2>&1 || _failed=1
    done <"$connfile"
    rm -f "$connfile"

    [ $_failed = 0 ] || {
        echo "Failed to send tickle control"
        return;
    }
}
697
698 ########################################################
699 # start/stop the Ganesha nfs service
700 ########################################################
startstop_ganesha()
{
    # $1: start, stop or restart - passed through to the
    # "nfs-ganesha-<CTDB_CLUSTER_FILESYSTEM_TYPE>" service.  Any other
    # operation is ignored.
    _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
    case "$1" in
        start|stop|restart)
            service "$_service_name" "$1"
            ;;
    esac
}
716
717 ########################################################
718 # start/stop the nfs service on different platforms
719 ########################################################
startstop_nfs() {
    # $1: start, stop or restart.
    #
    # The platform is "rhel" when an executable init.d/nfslock script
    # exists, else "sles" when init.d/nfsserver does, else unknown
    # (which is an error).
    PLATFORM="unknown"
    if [ -x $CTDB_ETCDIR/init.d/nfslock ] ; then
        PLATFORM="rhel"
    elif [ -x $CTDB_ETCDIR/init.d/nfsserver ] ; then
        PLATFORM="sles"
    fi

    case "$PLATFORM:$1" in
        sles:start)
            service nfsserver start
            ;;
        sles:stop)
            service nfsserver stop > /dev/null 2>&1
            ;;
        sles:restart)
            set_proc "fs/nfsd/threads" 0
            service nfsserver stop > /dev/null 2>&1
            pkill -9 nfsd
            service nfsserver start
            ;;
        rhel:start)
            service nfslock start
            service nfs start
            ;;
        rhel:stop)
            service nfs stop
            service nfslock stop
            ;;
        rhel:restart)
            set_proc "fs/nfsd/threads" 0
            service nfs stop > /dev/null 2>&1
            service nfslock stop > /dev/null 2>&1
            pkill -9 nfsd
            service nfslock start
            service nfs start
            ;;
        unknown:*)
            echo "Unknown platform. NFS is not supported with ctdb"
            exit 1
            ;;
    esac
}
772
773 ########################################################
774 # start/stop the nfs lockmanager service on different platforms
775 ########################################################
startstop_nfslock() {
    # $1: start, stop or restart.
    #
    # The platform is "rhel" when an executable init.d/nfslock script
    # exists, else "sles" when init.d/nfsserver does, else unknown
    # (which is an error).
    PLATFORM="unknown"
    if [ -x $CTDB_ETCDIR/init.d/nfslock ] ; then
        PLATFORM="rhel"
    elif [ -x $CTDB_ETCDIR/init.d/nfsserver ] ; then
        PLATFORM="sles"
    fi

    case "$PLATFORM:$1" in
        # for sles there is no service for lockmanager
        # so we instead just shutdown/restart nfs
        sles:start)
            service nfsserver start
            ;;
        sles:stop)
            service nfsserver stop > /dev/null 2>&1
            ;;
        sles:restart)
            service nfsserver stop
            service nfsserver start
            ;;
        rhel:start)
            service nfslock start
            ;;
        rhel:stop)
            service nfslock stop > /dev/null 2>&1
            ;;
        rhel:restart)
            service nfslock stop
            service nfslock start
            ;;
        unknown:*)
            echo "Unknown platform. NFS locking is not supported with ctdb"
            exit 1
            ;;
    esac
}
822
add_ip_to_iface()
{
    # Usage: add_ip_to_iface IFACE IP MASKBITS
    #
    # Bring IFACE up and add IP/MASKBITS to it while holding the
    # per-interface lock file.  The function's status is that of the
    # subshell below.
    _iface=$1
    _ip=$2
    _maskbits=$3

    _lockfile="${CTDB_VARDIR}/state/interface_modify_${_iface}.flock"
    mkdir -p "${_lockfile%/*}" # dirname
    [ -f "$_lockfile" ] || touch "$_lockfile"

    (
        # Note: use of return/exit/die() below only gets us out of the
        # sub-shell, which is actually what we want.  That is, the
        # function should just return non-zero.

        # The lock is taken on stdin, which is the lock file.
        if ! flock --timeout 30 0 ; then
            die "add_ip_to_iface: unable to get lock for ${_iface}"
        fi

        # Ensure interface is up
        if ! ip link set "$_iface" up ; then
            die "Failed to bringup interface $_iface"
        fi

        if ! ip addr add "$_ip/$_maskbits" brd + dev "$_iface" ; then
            die "Failed to add $_ip/$_maskbits on dev $_iface"
        fi
    ) <"$_lockfile"

    # Do nothing here - return above only gets us out of the subshell
    # and doing anything here will affect the return code.
}
852
delete_ip_from_iface()
{
    # Usage: delete_ip_from_iface IFACE IP MASKBITS
    #
    # Remove IP/MASKBITS from IFACE while holding the per-interface
    # lock file, re-adding any secondary addresses that disappear as
    # a result.  The function's status is that of the subshell below.
    _iface=$1
    _ip=$2
    _maskbits=$3

    _lockfile="${CTDB_VARDIR}/state/interface_modify_${_iface}.flock"
    mkdir -p "${_lockfile%/*}" # dirname
    [ -f "$_lockfile" ] || touch "$_lockfile"

    (
        # Note: use of return/exit/die() below only gets us out of the
        # sub-shell, which is actually what we want.  That is, the
        # function should just return non-zero.

        # The lock is taken on stdin, which is the lock file.
        if ! flock --timeout 30 0 ; then
            die "delete_ip_from_iface: unable to get lock for ${_iface}"
        fi

        _im="$_ip/$_maskbits"  # shorthand for readability

        # "ip addr del" will delete all secondary IPs if this is the
        # primary.  To work around this _very_ annoying behaviour we
        # have to keep a record of the secondaries and re-add them
        # afterwards.  Yuck!

        _secondaries=""
        if ip addr list dev "$_iface" primary | grep -Fq "inet $_im " ; then
            _secondaries=$(ip addr list dev "$_iface" secondary | \
                awk '$1 == "inet" { print $2 }')
        fi

        local _rc=0
        if ! ip addr del "$_im" dev "$_iface" ; then
            echo "Failed to del $_ip on dev $_iface"
            _rc=1
        fi

        # With no recorded secondaries this loop simply does nothing.
        for _i in $_secondaries; do
            if ip addr list dev "$_iface" | grep -Fq "inet $_i" ; then
                echo "Kept secondary $_i on dev $_iface"
                continue
            fi

            echo "Re-adding secondary address $_i to dev $_iface"
            if ! ip addr add $_i brd + dev $_iface ; then
                echo "Failed to re-add address $_i to dev $_iface"
                _rc=1
            fi
        done

        return $_rc
    ) <"$_lockfile"

    # Do nothing here - return above only gets us out of the subshell
    # and doing anything here will affect the return code.
}
910
911 # If the given IP is hosted then print 2 items: maskbits and iface 
ip_maskbits_iface ()
{
    # $1: IPv4 address.  For each "inet" line that
    # "ip addr show to ADDR/32" prints, output the mask bits (the part
    # of the second field after the "/") and the line's last field.
    # Prints nothing when the address is not hosted.
    _addr="$1"

    # sub() on a copy of the field does what gensub() did here, and
    # unlike gensub() it is not limited to gawk.
    ip addr show to "${_addr}/32" 2>/dev/null | \
        awk '$1 == "inet" { mask = $2 ; sub(".*/", "", mask) ; print mask, $NF }'
}
919
drop_ip ()
{
    # Usage: drop_ip ADDR[/MASKBITS] [LOG_TAG]
    #
    # If ADDR is currently hosted, delete it from its interface.  When
    # LOG_TAG is non-empty the removal is logged via script_log first.
    _addr="${1%/*}"  # Remove optional maskbits
    _log_tag="$2"

    # Replace the positional parameters with "maskbits iface"; they
    # are empty when the address is not hosted.
    set -- $(ip_maskbits_iface $_addr)
    [ -n "$1" ] || return 0

    _maskbits="$1"
    _iface="$2"

    [ -z "$_log_tag" ] || script_log "$_log_tag" \
        "Removing public address $_addr/$_maskbits from device $_iface"

    ip addr del $_addr/$_maskbits dev $_iface >/dev/null 2>&1
}
936
drop_all_public_ips ()
{
    # $1: optional log tag handed to drop_ip.
    _log_tag="$1"

    # The first word of every line in the public addresses file is an
    # address to drop; the rest of the line is ignored.  With
    # CTDB_PUBLIC_ADDRESSES unset the loop reads /dev/null.
    while read _ip _x
    do
        drop_ip "$_ip" "$_log_tag"
    done <"${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
}
945
946 ########################################################
947 # some simple logic for counting events - per eventscript
948 # usage: ctdb_counter_init
949 #        ctdb_counter_incr
950 #        ctdb_check_counter_limit <limit>
951 # ctdb_check_counter_limit fails when count >= <limit>
952 ########################################################
_ctdb_counter_common ()
{
    # Counter name: $1, else $service_name, else $script_name.
    _service_name="${1:-${service_name:-${script_name}}}"

    # The counter is a file of that name under $ctdb_fail_dir; make
    # sure its parent directory exists.
    _counter_file="${ctdb_fail_dir}/${_service_name}"
    mkdir -p "${_counter_file%/*}"
}
ctdb_counter_init ()
{
    # Reset the counter by truncating (or creating) its file.
    _ctdb_counter_common "$1"

    : >"$_counter_file"
}
# Increment a failure counter.  Counting is unary: the count is the
# size of the counter file in bytes, so exactly one byte is appended.
#   $1 - optional counter name, passed to _ctdb_counter_common
ctdb_counter_incr () {
    _ctdb_counter_common "$1"

    # printf rather than "echo -n": where echo does not support -n it
    # would append "-n 1" plus a newline and corrupt the count.
    printf '1' >> "$_counter_file"
}
# Compare the failure counter against a limit.
#   $1 - limit; defaults to $service_fail_limit
#   $2 - if non-empty, suppress the below-limit warning
# Exits the script with status 1 once the count has reached the limit.
# Below the limit a warning is printed for any non-zero count unless
# $2 was given.
ctdb_check_counter_limit () {
    _ctdb_counter_common

    _quiet="$2"
    _limit="${1:-${service_fail_limit}}"

    # The counter is unary, so the file size is the count.  A counter
    # file that can't be examined counts as zero.
    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)

    if [ $_size -ge $_limit ] ; then
        echo "ERROR: more than $_limit consecutive failures for $service_name, marking cluster unhealthy"
        exit 1
    fi

    if [ $_size -gt 0 ] && [ -z "$_quiet" ] ; then
        echo "WARNING: less than $_limit consecutive failures ($_size) for $service_name, not unhealthy yet"
    fi
}
# Compare the failure counter against a limit using a chosen operator.
#   $1 - "error" (the default) to print an error and exit 1 when the
#        comparison succeeds; anything else to silently return 1
#   $2 - an integer operator supported by test; defaults to -ge
#   $3 - limit; defaults to $service_fail_limit
#   $4 - optional counter name, passed to _ctdb_counter_common
# Returns 0 when the comparison does not succeed.
ctdb_check_counter () {
    _msg="${1:-error}"
    _op="${2:--ge}"
    _limit="${3:-${service_fail_limit}}"

    # Read $4 directly instead of using "shift 3".  All of the first 3
    # arguments have defaults, but shifting by more than $# is an
    # error: some shells abort the script, others leave $1 in place so
    # that the message type would be used as the counter name.
    _ctdb_counter_common "$4"

    # The counter is unary, so the file size is the count.  A counter
    # file that can't be examined counts as zero.
    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
    if [ "$_size" "$_op" "$_limit" ] ; then
        if [ "$_msg" = "error" ] ; then
            echo "ERROR: $_limit consecutive failures for $_service_name, marking node unhealthy"
            exit 1
        else
            return 1
        fi
    fi
}
1002
1003 ########################################################
1004
# Directories under $CTDB_VARDIR used for per-script status files and
# for the failure counters.
ctdb_status_dir="${CTDB_VARDIR}/status"
ctdb_fail_dir="${CTDB_VARDIR}/failcount"
1007
# Set $service_state_dir to a directory under $CTDB_VARDIR/state and
# create it.  Exits the script with status 1 if it can't be created.
#   $1 - optional directory name; defaults to $service_name
ctdb_setup_service_state_dir ()
{
    service_state_dir="$CTDB_VARDIR/state/${1:-${service_name}}"

    if ! mkdir -p "$service_state_dir" ; then
        echo "Error creating state dir \"$service_state_dir\""
        exit 1
    fi
}
1016
1017 ########################################################
1018 # Managed status history, for auto-start/stop
1019
# Directory holding one marker file per service that has been managed.
ctdb_managed_dir="${CTDB_VARDIR}/managed_history"
1021
# Set $_ctdb_managed_file to the managed-history marker file for
# $service_name.
_ctdb_managed_common ()
{
    _ctdb_managed_file="${ctdb_managed_dir}/${service_name}"
}
1026
# Record that $service_name is managed by creating its marker file
# (and the directory that holds it, if necessary).
ctdb_service_managed ()
{
    mkdir -p "$ctdb_managed_dir"
    _ctdb_managed_common
    touch "$_ctdb_managed_file"
}
1033
# Forget that $service_name was managed by removing its marker file.
# It is not an error for the marker file to be missing.
ctdb_service_unmanaged ()
{
    _ctdb_managed_common

    rm -f "${_ctdb_managed_file}"
}
1039
# Succeed if the marker file for $service_name exists, that is, if
# ctdb_service_managed has been run for it and ctdb_service_unmanaged
# has not been run since.
is_ctdb_previously_managed_service ()
{
    _ctdb_managed_common

    test -f "$_ctdb_managed_file"
}
1045
1046 ########################################################
1047 # Check and set status
1048
# Print a one-line problem report for this script.
#   $1 - node state to report (e.g. "unhealthy" or "banned")
#   $2 - file whose contents describe the problem
log_status_cat ()
{
    # The file name is quoted so that a path containing whitespace or
    # glob characters is passed to cat as a single argument.
    echo "node is \"$1\", \"${script_name}\" reports problem: $(cat "$2")"
}
1053
# Report the status recorded for this script by ctdb_setstatus.
# Returns 1 (after logging the reason) if a readable "unhealthy" file
# exists, 2 (after logging the reason) if a readable "banned" file
# exists, and 0 otherwise.  "unhealthy" is checked first.
ctdb_checkstatus ()
{
    _status_path="$ctdb_status_dir/$script_name"

    if [ -r "$_status_path/unhealthy" ] ; then
        log_status_cat "unhealthy" "$_status_path/unhealthy"
        return 1
    fi

    if [ -r "$_status_path/banned" ] ; then
        log_status_cat "banned" "$_status_path/banned"
        return 2
    fi

    return 0
}
1066
# Record or clear the status of this script.
#   $1 - "unhealthy" or "banned" to record that status; anything else
#        clears both
#   $2 - file whose contents are stored as the reason (only used when
#        recording a status)
ctdb_setstatus ()
{
    # Scratch variable uses the "_" prefix, like the rest of this
    # file, so that generic names in the calling script are not
    # overwritten.
    _status_d="$ctdb_status_dir/$script_name"
    case "$1" in
        unhealthy|banned)
            mkdir -p "$_status_d"
            cat "$2" >"$_status_d/$1"
            ;;
        *)
            rm -f "$_status_d/banned" "$_status_d/unhealthy"
            ;;
    esac
}
1082
1083 ##################################################################
1084 # Reconfigure a service on demand
1085
# Shared setup for the reconfigure functions.  Sets $_d to the status
# directory for $service_name (creating it if necessary) and sets
# $_ctdb_service_reconfigure_flag to the flag file inside it.
_ctdb_service_reconfigure_common ()
{
    _d="${ctdb_status_dir}/${service_name}"
    _ctdb_service_reconfigure_flag="${_d}/reconfigure"
    mkdir -p "$_d"
}
1092
# Succeed if the reconfigure flag file for $service_name exists.
ctdb_service_needs_reconfigure ()
{
    _ctdb_service_reconfigure_common

    test -e "$_ctdb_service_reconfigure_flag"
}
1098
# Flag $service_name as needing a reconfigure by creating (or
# truncating) its flag file.
ctdb_service_set_reconfigure ()
{
    _ctdb_service_reconfigure_common

    : >"$_ctdb_service_reconfigure_flag"
}
1104
# Clear any pending reconfigure for $service_name by removing its flag
# file.  It is not an error for the flag file to be missing.
ctdb_service_unset_reconfigure ()
{
    _ctdb_service_reconfigure_common

    rm -f "${_ctdb_service_reconfigure_flag}"
}
1110
# Reconfigure $service_name now.  The pending-reconfigure flag is
# cleared first.  If service_reconfigure fails then its status is
# returned and the failure counter is left alone; otherwise the
# failure counter is reset.
ctdb_service_reconfigure ()
{
    echo "Reconfiguring service \"${service_name}\"..."
    ctdb_service_unset_reconfigure

    if service_reconfigure ; then
        ctdb_counter_init
    else
        # Here $? is still the exit status of service_reconfigure.
        return $?
    fi
}
1118
# Default service_reconfigure(): there is nothing to do, so just
# report success.
service_reconfigure ()
{
    return 0
}
1124
# Try to claim the reconfigure lock file for $service_name on behalf
# of the current event.  The lock file holds an event name on its
# first line followed by one PID per line.  Fails (returns 1) if any
# listed PID, other than our own, belongs to a process that can still
# be signalled.  Otherwise the file is rewritten with $event_name and
# our PID, and 0 is returned.
ctdb_reconfigure_try_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"
    # Directory part of the lock file name
    mkdir -p "${_lock%/*}"
    touch "$_lock"

    # The subshell reads the lock file on stdin and holds an flock on
    # that same descriptor while it checks and rewrites the file.  The
    # subshell's exit status is the function's return status.
    (
        flock 0

        # Reading a list of PIDs is more than is needed for a single
        # locker, but it keeps working if certain events (e.g. takeip)
        # are ever allowed to run in parallel and record several PIDs.
        read _locker_event
        if [ -n "$_locker_event" ] ; then
            while read _pid ; do
                [ -n "$_pid" ] || continue
                [ "$_pid" != $$ ] || continue
                kill -0 "$_pid" 2>/dev/null && exit 1
            done
        fi

        printf "%s\n%s\n" "$event_name" $$ >"$_lock"
        exit 0
    ) <"$_lock"
}
1151
# Print the error output recorded for this script's previous monitor
# event and exit with the recorded return code.  This never returns to
# the caller.
ctdb_replay_monitor_status ()
{
    echo "Replaying previous status for this script due to reconfigure..."

    # Pick out this script's monitor line.  Some versions omit the
    # leading colon (':'), so it is optional in the pattern.
    _out=$(ctdb scriptstatus -Y | grep -E "^:?monitor:${script_name}:")

    # A line looks like this:
    # :monitor:60.nfs:1:ERROR:1314764004.030861:1314764004.035514:foo bar:
    # Splitting on ':' and re-reading the words is a cheap way of
    # getting at the return code and status fields in the middle.
    set -- $(IFS=":" ; echo $_out)
    _code="$3"
    _status="$4"

    # The error output is the last field and may itself contain
    # colons.  So, rather than using the split words, drop the trailing
    # ':' and then strip the leading fields.  The loose match at the
    # start copes with output both with and without the leading ':'.
    _out="${_out%:}"
    _err_out="${_out#*monitor:${script_name}:*:*:*:*:}"

    # Only 2 statuses need recasting, because their real (negative)
    # return codes can't be used as an exit status.  Anything else
    # (OK, ERROR) is replayed unchanged.
    case "$_status" in
        TIMEDOUT)
            # Replayed as an error.
            _code=1
            _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        DISABLED)
            # Replayed as OK.
            _code=0
            _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
    esac

    echo "$_err_out"
    exit $_code
}
1187
# Run, defer or retry a pending reconfigure, depending on the current
# event and on whether the reconfigure lock could be taken.
#
#   event          lock taken                     lock not taken
#   reconfigure    reconfigure, exit with its     exit 2 (caller should
#                  status                         retry)
#   ipreallocated  reconfigure if flagged         defer, return 0
#   monitor        if flagged: reconfigure, then  replay previous status
#                  replay previous status         (exits)
#                  (exits)
#
# All other events return 0 immediately without touching the lock.
ctdb_service_check_reconfigure ()
{
    assert_service_name

    # Only some events are of interest here.
    case "$event_name" in
        monitor|ipreallocated|reconfigure) : ;;
        *) return 0 ;;
    esac

    # Zero means none of the events handled here is running elsewhere;
    # non-zero means somebody else holds the lock, so tread carefully.
    ctdb_reconfigure_try_lock
    _lock_rc=$?

    case "$event_name" in
        reconfigure)
            if [ $_lock_rc -ne 0 ] ; then
                # Tell whoever called us to retry.
                exit 2
            fi
            (ctdb_service_reconfigure)
            exit $?
            ;;
        ipreallocated)
            # Without the lock any scheduled reconfigure is deferred
            # and the rest of the ipreallocated event simply runs, as
            # per the eventscript.  This assumes that the event doesn't
            # depend on a scheduled reconfigure, which is true in the
            # current code.
            if [ $_lock_rc -eq 0 ] && ctdb_service_needs_reconfigure ; then
                ctdb_service_reconfigure
            fi
            ;;
        monitor)
            if [ $_lock_rc -eq 0 ] ; then
                if ctdb_service_needs_reconfigure ; then
                    ctdb_service_reconfigure
                    # The service might not be stable yet after the
                    # reconfigure, so the previous status is the best
                    # information available.
                    ctdb_replay_monitor_status
                fi
            else
                # Most likely a reconfigure is in progress, so the
                # service is possibly unstable.  Defer any scheduled
                # reconfigure and replay the previous monitor status,
                # since that is the best information available.
                ctdb_replay_monitor_status
            fi
            ;;
    esac
}
1250
1251 ##################################################################
1252 # Does CTDB manage this service? - and associated auto-start/stop
1253
# Backward compatibility helper: append a service to
# $CTDB_MANAGED_SERVICES when its old-style option is enabled.
#   $1 - value of the old-style option; only "yes" enables it
#   $2 - service name; only acted on if it equals $service_name
ctdb_compat_managed_service ()
{
    [ "$1" = "yes" ] || return 0
    [ "$2" = "$service_name" ] || return 0

    CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
}
1260
# Succeed if $service_name is listed in $CTDB_MANAGED_SERVICES, either
# directly or after the old-style CTDB_MANAGES_* options have been
# folded into it.
is_ctdb_managed_service ()
{
    assert_service_name

    # Padding the list with a space at each end means that every
    # entry, including the first and last, is surrounded by spaces and
    # so can be matched exactly as "<space>name<space>".
    t=" $CTDB_MANAGED_SERVICES "
    case "$t" in
        *" "${service_name}" "*) return 0 ;;
    esac

    # No match, so update $CTDB_MANAGED_SERVICES from the old-style
    # options for backward compatibility and then try again.
    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_SCP"      "scp"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"

    t=" $CTDB_MANAGED_SERVICES "
    case "$t" in
        *" "${service_name}" "*) return 0 ;;
        *) return 1 ;;
    esac
}
1292
# Start or stop $service_name in response to the current event.
#
# The service-start and service-stop pseudo-events start/stop the
# service on request and then exit with the resulting status.  They
# are refused (via die) if the service is managed or if
# $CTDB_SERVICE_AUTOSTARTSTOP is "yes".
#
# Otherwise, when $CTDB_SERVICE_AUTOSTARTSTOP is "yes", the monitor
# event starts a service that has just become managed, or stops one
# that is no longer managed, in the background and then exits.  In all
# other cases 0 is returned and nothing is done.
ctdb_start_stop_service ()
{
    assert_service_name

    case "$event_name" in
        service-start|service-stop)
            if is_ctdb_managed_service ; then
                die "$event_name event not permitted when service is managed"
            fi
            if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
                die "$event_name event not permitted with \$CTDB_SERVICE_AUTOSTARTSTOP = yes"
            fi
            # Runs ctdb_service_start or ctdb_service_stop, chosen by
            # the part of the event name after "service-".
            "ctdb_service_${event_name#service-}"
            exit $?
            ;;
    esac

    # Auto-start/stop only happens if configured, and only on monitor
    # events.
    [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] || return 0
    [ "$event_name" = "monitor" ] || return 0

    if is_ctdb_managed_service ; then
        # Managed now: nothing to do unless that's news.
        is_ctdb_previously_managed_service && return 0

        echo "Starting service \"$service_name\" - now managed"
        background_with_logging ctdb_service_start
        exit $?
    fi

    # Not managed now: nothing to do unless it used to be.
    is_ctdb_previously_managed_service || return 0

    echo "Stopping service \"$service_name\" - no longer managed"
    background_with_logging ctdb_service_stop
    exit $?
}
1342
# Start the service.  It is marked as managed as soon as a start is
# attempted, whether or not the start works.  If service_start fails
# then its status is returned; otherwise the failure counter is reset
# and ctdb_check_tcp_init is run.
ctdb_service_start ()
{
    ctdb_service_managed

    if service_start ; then
        ctdb_counter_init
        ctdb_check_tcp_init
    else
        # Here $? is still the exit status of service_start.
        return $?
    fi
}
1353
# Stop the service.  It is marked as unmanaged first; the return
# status is that of service_stop.
ctdb_service_stop ()
{
    ctdb_service_unmanaged

    service_stop
}
1359
1360 # Default service_start() and service_stop() functions.
1361  
1362 # These may be overridden in an eventscript.  When overriding, the
1363 # following convention must be followed.  If these functions are
1364 # called with no arguments then they may use internal logic to
1365 # determine whether the service is managed and, therefore, whether
1366 # they should take any action.  However, if the service name is
1367 # specified as an argument then an attempt must be made to start or
1368 # stop the service.  This is because the auto-start/stop code calls
1369 # them with the service name as an argument.
# Default: start $service_name using the "service" command.
service_start ()
{
    service "${service_name}" start
}
1374
# Default: stop $service_name using the "service" command.
service_stop ()
{
    service "${service_name}" stop
}
1379
1380 ##################################################################
1381
# Handle the events that are dealt with identically by every script.
#   status    - report the recorded status and exit with the status of
#               ctdb_checkstatus
#   setstatus - hand the remaining arguments to ctdb_setstatus and
#               exit with its status
# For any other event this returns without doing anything.
ctdb_standard_event_handler ()
{
    if [ "$1" = "status" ] ; then
        ctdb_checkstatus
        exit $?
    elif [ "$1" = "setstatus" ] ; then
        shift
        ctdb_setstatus "$@"
        exit $?
    fi
}
1396
# iptables doesn't like being re-entered, so flock-wrap it.  All
# arguments are handed to /sbin/iptables, which is run under a lock on
# a file in $CTDB_VARDIR ("-w 30" is the flock wait timeout, in
# seconds).
iptables()
{
    # The lock file name is quoted so that a $CTDB_VARDIR containing
    # whitespace is still passed to flock as a single argument.
    flock -w 30 "$CTDB_VARDIR/iptables-ctdb.flock" /sbin/iptables "$@"
}
1402
1403 ########################################################
1404 # tickle handling
1405 ########################################################
1406
# Bring the tickle list held by ctdb into line with the established
# TCP connections, as listed by netstat, to a port on the public IPs
# that "ctdb -Y ip" reports for this node.
#   $1 - local port of interest
# Two scratch files under $CTDB_VARDIR/state/tickles are used and then
# removed again.
update_tickles ()
{
    _port="$1"

    tickledir="$CTDB_VARDIR/state/tickles"
    mkdir -p "$tickledir"

    # Scratch files: current connections and current tickles.  Start
    # with neither in place.
    _my_connections="${tickledir}/${_port}.connections"
    _my_tickles="${tickledir}/${_port}.tickles"
    rm -f "$_my_connections" "$_my_tickles"

    # Our node number, with any "PNN:" prefix stripped
    _pnn=$(ctdb pnn)
    _pnn=${_pnn#PNN:}

    # Public IPs listed against our node number
    _ips=$(ctdb -Y ip | awk -F: -v pnn=$_pnn '$3 == pnn {print $2}')

    # The same IPs as a regexp alternation, with the dots escaped
    _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"

    # Established connections to our public IPs on the port, as
    # "<remote> <local>" pairs
    netstat -tn |
        awk -v destpat="^${_ipschoice}:${_port}\$" \
            '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
        sort >"$_my_connections"

    # Tickles that ctdb currently has for those IPs, in the same format
    for _i in $_ips ; do
        ctdb -Y gettickles $_i $_port |
            awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
    done |
        sort >"$_my_tickles"

    # Connections without a tickle: add one
    comm -23 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
            ctdb addtickle $_src $_dst
        done

    # Tickles without a connection: remove them
    comm -13 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
            ctdb deltickle $_src $_dst
        done

    rm -f "$_my_connections" "$_my_tickles"
}
1454
1455 ########################################################
1456 # load a site local config file
1457 ########################################################
1458
# A site-specific file named by $CTDB_RC_LOCAL is sourced if it is
# executable.
if [ -n "$CTDB_RC_LOCAL" ] && [ -x "$CTDB_RC_LOCAL" ] ; then
    . "$CTDB_RC_LOCAL"
fi

# Likewise an executable rc.local in $CTDB_BASE...
if [ -x $CTDB_BASE/rc.local ] ; then
    . $CTDB_BASE/rc.local
fi

# ... and every executable file in $CTDB_BASE/rc.local.d.
if [ -d $CTDB_BASE/rc.local.d ] ; then
    for i in $CTDB_BASE/rc.local.d/* ; do
        if [ -x "$i" ] ; then
            . "$i"
        fi
    done
fi

# Basename of the running script
script_name="${0##*/}"
service_fail_limit=1
event_name="$1"