ctdb/config/functions

   1 # Hey Emacs, this is a -*- shell-script -*- !!!
   2
   3 # utility functions for ctdb event scripts
   4
   # Default the state directory when CTDB_VARDIR is empty or unset:
   # use /var/lib/ctdb if that directory exists, otherwise /var/ctdb.
   if [ -z "$CTDB_VARDIR" ] ; then
       if [ -d "/var/lib/ctdb" ] ; then
           CTDB_VARDIR="/var/lib/ctdb"
       else
           CTDB_VARDIR="/var/ctdb"
       fi
       export CTDB_VARDIR
   fi

   # Default the system configuration directory to /etc when
   # CTDB_ETCDIR is empty or unset.
   if [ -z "$CTDB_ETCDIR" ] ; then
       CTDB_ETCDIR="/etc"
       export CTDB_ETCDIR
   fi
  15
  #######################################
  # pull in a system config file, if any
  #
  # usage: _loadconfig [NAME]
  #
  # With no NAME, the name is taken from $service_config or, failing
  # that, $service_name, and handed back to loadconfig().  For any NAME
  # other than "ctdb" the "ctdb" configuration is loaded first.  The
  # first existing file out of $CTDB_ETCDIR/sysconfig/NAME,
  # $CTDB_ETCDIR/default/NAME and $CTDB_BASE/sysconfig/NAME is sourced.
  # For "ctdb", $CTDB_BASE/ctdbd.conf is sourced afterwards if readable.
  _loadconfig() {

      if [ -z "$1" ] ; then
          foo="${service_config:-${service_name}}"
          if [ -n "$foo" ] ; then
              loadconfig "$foo"
              return
          fi
      fi

      if [ "$1" != "ctdb" ] ; then
          loadconfig "ctdb"
      fi

      # Still no name to work with: only the "ctdb" config was wanted.
      if [ -z "$1" ] ; then
          return
      fi

      # Paths are quoted so that directories containing whitespace or
      # glob characters are tested and sourced correctly.
      if [ -f "$CTDB_ETCDIR/sysconfig/$1" ]; then
          . "$CTDB_ETCDIR/sysconfig/$1"
      elif [ -f "$CTDB_ETCDIR/default/$1" ]; then
          . "$CTDB_ETCDIR/default/$1"
      elif [ -f "$CTDB_BASE/sysconfig/$1" ]; then
          . "$CTDB_BASE/sysconfig/$1"
      fi

      if [ "$1" = "ctdb" ] ; then
          _config="${CTDB_BASE}/ctdbd.conf"
          if [ -r "$_config" ] ; then
              . "$_config"
          fi
      fi
  }
  51
  # Public entry point for loading configuration: every argument is
  # passed straight through to _loadconfig().
  loadconfig () {
      _loadconfig "$@"
  }
  55
  56 ##############################################################
  57
  # Emit debug output, each line prefixed with "DEBUG: ", but only when
  # the debug level is at least 4.  The level is taken from
  # CTDB_SCRIPT_DEBUGLEVEL (default 2), which can be overridden by
  # setting it in a configuration file.
  debug ()
  {
      # Below the threshold: stay silent.
      [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] || return 0

      if [ -n "$1" ] ; then
          # Arguments were given, so they are the message.
          echo "DEBUG: $*"
          return
      fi

      # No arguments: prefix every line of stdin instead, which lets a
      # caller pass lots of debug using a here document.  Nothing is
      # read when stdin is a terminal.
      if ! tty -s ; then
          sed -e 's@^@DEBUG: @'
      fi
  }
  73
  # Print a message on stdout and terminate the script.
  # $1 is the message, $2 the exit code (defaults to 1).
  die ()
  {
      echo "$1"
      exit ${2:-1}
  }
  82
  # Log given message or stdin to either syslog or a CTDB log file
  # $1 is the tag passed to logger if syslog is in use.
  #
  # The destination is chosen from CTDB_LOGGING: "file:PATH" appends to
  # PATH, an empty value appends to /var/log/log.ctdb, and anything
  # else goes through logger(1).  When no message arguments remain
  # after the tag, the message is read from stdin.
  script_log ()
  {
      _tag="$1" ; shift

      case "$CTDB_LOGGING" in
          file:*|"")
              if [ -n "$CTDB_LOGGING" ] ; then
                  _file="${CTDB_LOGGING#file:}"
              else
                  _file="/var/log/log.ctdb"
              fi
              {
                  if [ -n "$*" ] ; then
                      echo "$*"
                  else
                      cat
                  fi
              } >>"$_file"
              ;;
          *)
              # Handle all syslog:* variants here too.  There's no tool to do
              # the lossy things, so just use logger.
              if [ -n "$*" ] ; then
                  # Pass the message as one quoted word so that glob
                  # characters and runs of whitespace reach the log intact.
                  logger -t "ctdbd: ${_tag}" "$*"
              else
                  # No message: logger reads it from stdin.
                  logger -t "ctdbd: ${_tag}"
              fi
              ;;
      esac
  }
 111
 # Run a command in the background with its output logged.
 #
 # Output from things backgrounded in an eventscript might otherwise
 # get lost, so stdout and stderr of the command are piped into
 # script_log(), tagged with "${script_name}&".  The command's stdin is
 # /dev/null.  Always returns 0 without waiting for the command.
 background_with_logging ()
 {
     {
         "$@" </dev/null 2>&1 | script_log "${script_name}&"
     } &

     return 0
 }
 123
 ##############################################################
 # check number of args for different events
 #
 # The event name is $1 and counts as one of the arguments: takeip and
 # releaseip need 4 in total, updateip needs 5.  Any other event is
 # accepted as-is.  On a mismatch an error is printed and the script
 # exits with status 1.
 ctdb_check_args ()
 {
     case "$1" in
         takeip|releaseip)
             [ $# -eq 4 ] && return 0
             echo "ERROR: must supply interface, IP and maskbits"
             ;;
         updateip)
             [ $# -eq 5 ] && return 0
             echo "ERROR: must supply old interface, new interface, IP and maskbits"
             ;;
         *)
             return 0
             ;;
     esac

     # Only reached after one of the error messages above.
     exit 1
 }
 143
 ##############################################################
 # determine on what type of system (init style) we are running
 #
 # Sets CTDB_INIT_STYLE to "suse" if /sbin/startproc is executable,
 # else "debian" if /sbin/start-stop-daemon is executable, else
 # "redhat".
 detect_init_style()
 {
     # An existing setting always wins, so no detection is done (this
     # path yields a non-zero status).
     [ -n "$CTDB_INIT_STYLE" ] && return 1

     # Start from the fallback and let the more specific probes
     # override it; the last matching probe has the highest priority.
     CTDB_INIT_STYLE="redhat"
     [ ! -x /sbin/start-stop-daemon ] || CTDB_INIT_STYLE="debian"
     [ ! -x /sbin/startproc ] || CTDB_INIT_STYLE="suse"
 }
 159
 ######################################################
 # simulate /sbin/service on platforms that don't have it
 # _service() makes it easier to hook the service() function for
 # testing.
 #
 # usage: _service SERVICE_NAME OPERATION
 #
 # Uses /sbin/service when it is executable, otherwise the init script
 # under $CTDB_ETCDIR/init.d or $CTDB_ETCDIR/rc.d/init.d.  The command
 # is prefixed with $_nice, which the callers set.
 _service ()
 {
   _service_name="$1"
   _op="$2"

   # do nothing, when no service was specified
   [ -z "$_service_name" ] && return

   # $_nice is deliberately unquoted: when empty it must vanish.  The
   # script paths are quoted so that whitespace in $CTDB_ETCDIR or the
   # service name cannot split them.
   if [ -x /sbin/service ]; then
       $_nice /sbin/service "$_service_name" "$_op"
   elif [ -x "$CTDB_ETCDIR/init.d/$_service_name" ]; then
       $_nice "$CTDB_ETCDIR/init.d/$_service_name" "$_op"
   elif [ -x "$CTDB_ETCDIR/rc.d/init.d/$_service_name" ]; then
       $_nice "$CTDB_ETCDIR/rc.d/init.d/$_service_name" "$_op"
   fi
 }
 180
 # Run an operation on a service via _service(), without nice.
 # Arguments are passed through unchanged (service name, operation).
 service()
 {
     # Empty prefix: _service() runs the command at normal priority.
     _nice=""
     _service "$@"
 }
 186
 ######################################################
 # simulate /sbin/service (niced) on platforms that don't have it
 # Arguments are passed through unchanged to _service().
 nice_service()
 {
     # _service() prefixes the command it runs with $_nice.
     _nice="nice"
     _service "$@"
 }
 194
 ######################################################
 # wrapper around /proc/ settings to allow them to be hooked
 # for testing
 # 1st arg is relative path under /proc/, 2nd arg is value to set
 set_proc ()
 {
     # Overwrite the /proc entry with the new value.
     echo "$2" >"/proc/$1"
 }
 203
 ######################################################
 # wrapper around getting file contents from /proc/ to allow
 # this to be hooked for testing
 # 1st arg is relative path under /proc/
 # The contents are written to stdout.
 get_proc ()
 {
     cat "/proc/$1"
 }
 212
 ######################################################
 # Check that an RPC service is healthy -
 # this includes allowing a certain number of failures
 # before marking the NFS service unhealthy.
 #
 # usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
 #
 # each triple is a set of 3 arguments: an operator, a
 # fail count limit and an action string.
 #
 # For example:
 #
 #       nfs_check_rpc_service "lockd" \
 #           -ge 15 "verbose restart unhealthy" \
 #           -eq 10 "restart:b"
 #
 # says that if lockd is down for 15 iterations then do
 # a verbose restart of lockd and mark the node unhealthy.
 # Before this, after 10 iterations of failure, the
 # service is restarted in the background.
 # Order is important: the number of failures need to be
 # specified in reverse order because processing stops
 # after the first condition that is true.
 ######################################################
 nfs_check_rpc_service ()
 {
     _prog_name="$1" ; shift

     # Nothing more to do while the service answers its RPC check.
     _nfs_check_rpc_common "$_prog_name" && return

     # Walk the triples until one of them fires or none are left.
     while [ -n "$3" ] && ! _nfs_check_rpc_action "$1" "$2" "$3" ; do
         shift 3
     done
 }
 252
 # The new way of doing things...
 #
 # Check every RPC service that has a check file in
 # ${CTDB_BASE}/nfs-rpc-checks.d/.  The program name is the file name
 # with the leading "NN." and trailing ".check" removed.  Each
 # non-comment line holds an operator, a limit and an action string,
 # which are passed to _nfs_check_rpc_action() when the service fails
 # its RPC check.
 nfs_check_rpc_services ()
 {
     # Files must end with .check - avoids editor backups, RPM fu, ...
     for _f in "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9].*.check ; do
         # When nothing matches, the shell leaves the pattern itself in
         # $_f.  Skip that (and anything unreadable) instead of treating
         # the pattern text as an RPC program name.
         [ -r "$_f" ] || continue

         _t="${_f%.check}"
         _prog_name="${_t##*/[0-9][0-9].}"

         if _nfs_check_rpc_common "$_prog_name" ; then
             # This RPC service is up, check next service...
             continue
         fi

         # Check each line in the file in turn until one of the limit
         # checks is hit...
         while read _cmp _lim _rest ; do
             # Skip blank lines and comments
             case "$_cmp" in
                 ""|\#*) continue ;;
             esac

             if _nfs_check_rpc_action "$_cmp" "$_lim" "$_rest" ; then
                 # Limit was hit on this line, no further checking...
                 break
             fi
         done <"$_f"
     done
 }
 281
 # Check one RPC service and maintain its failure counter.
 #
 # $1 is the program name (nfsd, mountd, rquotad, lockd or statd); any
 # other name is an internal error and exits with status 1.  Sets
 # $_service_name to "nfs_<program>".  Returns 0 and resets the counter
 # when the RPC check succeeds; otherwise increments the counter and
 # returns 1.
 _nfs_check_rpc_common ()
 {
     _prog_name="$1"

     # Map the program name to the RPC program and version to probe.
     case "$_prog_name" in
         nfsd)
             _rpc_prog=nfs ; _version=3
             ;;
         mountd)
             _rpc_prog=mountd ; _version=1
             ;;
         rquotad)
             _rpc_prog=rquotad ; _version=1
             ;;
         lockd)
             _rpc_prog=nlockmgr ; _version=4
             ;;
         statd)
             # Some platforms don't have separate programs for all
             # services: report success when rpc.statd is not found.
             which "rpc.${_prog_name}" >/dev/null 2>&1 || return 0
             _rpc_prog=status ; _version=1
             ;;
         *)
             echo "Internal error: unknown RPC program \"$_prog_name\"."
             exit 1
             ;;
     esac

     _service_name="nfs_${_prog_name}"

     if ! ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
         ctdb_counter_incr "$_service_name"
         return 1
     fi

     ctdb_counter_init "$_service_name"
     return 0
 }
 329
 # Apply one (operator, limit, actions) triple to the failure counter
 # of $_service_name.
 #
 # $1 is the comparison operator, $2 the limit and $3 a
 # whitespace-separated list of actions: verbose, restart, restart:b
 # and unhealthy.  Returns 1 when the limit was not hit; returns 0
 # after running the actions, unless "unhealthy" or an unknown action
 # exits the script with status 1.
 _nfs_check_rpc_action ()
 {
     _cmp="$1"
     _limit="$2"
     _actions="$3"

     # In "quiet" mode ctdb_check_counter succeeds while the limit has
     # not been hit, in which case there is nothing to do here.
     ctdb_check_counter "quiet" "$_cmp" "$_limit" "$_service_name" && return 1

     # Deliberately unquoted: the action string is split into words.
     for _action in $_actions ; do
         case "$_action" in
             verbose)   echo "$ctdb_check_rpc_out" ;;
             restart)   _nfs_restart_rpc_service "$_prog_name" ;;
             restart:b) _nfs_restart_rpc_service "$_prog_name" true ;;
             unhealthy) exit 1 ;;
             *)
                 echo "Internal error: unknown action \"$_action\"."
                 exit 1
                 ;;
         esac
     done

     return 0
 }
 362
 # Restart a single RPC service.
 #
 # $1 is the program name (nfsd, mountd, rquotad, lockd or statd); any
 # other name is an internal error and exits with status 1.  $2, when
 # "true", runs the (re)start command via background_with_logging().
 _nfs_restart_rpc_service ()
 {
     _prog_name="$1"
     _background="${2:-false}"

     _maybe_background=""
     if $_background ; then
         _maybe_background="background_with_logging"
     fi

     _p="rpc.${_prog_name}"

     # First announce the restart and, for the stand-alone daemons,
     # kill the running instance.
     case "$_prog_name" in
         nfsd)
             echo "Trying to restart NFS service"
             ;;
         lockd)
             echo "Trying to restart lock manager service"
             ;;
         mountd|rquotad|statd)
             echo "Trying to restart $_prog_name [${_p}]"
             killall -q -9 "$_p"
             ;;
         *)
             echo "Internal error: unknown RPC program \"$_prog_name\"."
             exit 1
             ;;
     esac

     # Then start it again.  The option variables are left unquoted so
     # that an unset port/hostname contributes no arguments at all.
     case "$_prog_name" in
         nfsd)
             $_maybe_background startstop_nfs restart
             ;;
         lockd)
             $_maybe_background startstop_nfslock restart
             ;;
         mountd)
             $_maybe_background $_p ${MOUNTD_PORT:+-p} $MOUNTD_PORT
             ;;
         rquotad)
             $_maybe_background $_p ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT
             ;;
         statd)
             $_maybe_background $_p \
                 ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME \
                 ${STATD_PORT:+-p} $STATD_PORT \
                 ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT
             ;;
     esac
 }
 408
 ######################################################
 # check that a rpc server is registered with portmap
 # and responding to requests
 # usage: ctdb_check_rpc SERVICE_NAME VERSION
 #
 # Queries rpcinfo over UDP on $CTDB_RPCINFO_LOCALHOST (default
 # 127.0.0.1).  The rpcinfo output is left in $ctdb_check_rpc_out; on
 # failure it is prefixed with an error line, printed, and 1 is
 # returned.
 ######################################################
 ctdb_check_rpc ()
 {
     progname="$1"
     version="$2"

     _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"

     # Success path: nothing is printed.
     ctdb_check_rpc_out=$(rpcinfo -u $_localhost $progname $version 2>&1) && return 0

     ctdb_check_rpc_out="ERROR: $progname failed RPC check:
$ctdb_check_rpc_out"
     echo "$ctdb_check_rpc_out"
     return 1
 }
 428
 ######################################################
 # Ensure $service_name is set; otherwise abort via die().
 assert_service_name ()
 {
     if [ -z "$service_name" ] ; then
         die "INTERNAL ERROR: \$service_name not set"
     fi
 }
 435
 ######################################################
 # check a set of directories is available
 # return 1 on a missing directory
 # directories are read from stdin, one per line
 #
 # Lines containing a "%" are skipped.  On failure the offending line
 # is left in $d.
 ######################################################
 ctdb_check_directories_probe()
 {
     while IFS="" read d ; do
         case "$d" in
             *%*)
                 # Contains a "%": not checked.
                 ;;
             *)
                 [ -d "${d}/." ] || return 1
                 ;;
         esac
     done
 }
 453
 ######################################################
 # check a set of directories is available
 # directories are read from stdin
 #
 # If ctdb_check_directories_probe() fails, an error naming the
 # directory left in $d is printed and the script exits with status 1.
 ######################################################
 ctdb_check_directories()
 {
     if ! ctdb_check_directories_probe ; then
         echo "ERROR: $service_name directory \"$d\" not available"
         exit 1
     fi
 }
 465
 466 ######################################################
 467 # check a set of tcp ports
 468 # usage: ctdb_check_tcp_ports <ports...>
 469 ######################################################
 470
 # This flag file is created when a service is initially started.  It
 # is deleted the first time TCP port checks for that service succeed.
 # Until then ctdb_check_tcp_ports() prints a more subtle "error"
 # message if a port check fails.
 #
 # Sets $_ctdb_service_started_file to the flag file path for
 # $service_name, under $ctdb_fail_dir.
 _ctdb_check_tcp_common ()
 {
     # The path is built from $service_name, so it must be set.
     assert_service_name
     _ctdb_service_started_file="$ctdb_fail_dir/$service_name.started"
 }
 480
 # Create the "service started" flag file for $service_name, creating
 # its directory first if necessary.
 ctdb_check_tcp_init ()
 {
     _ctdb_check_tcp_common
     mkdir -p "${_ctdb_service_started_file%/*}" # dirname
     touch "$_ctdb_service_started_file"
 }
 487
 # Check whether something is listening on all of the given TCP ports
 # using the "ctdb checktcpport" command.
 #
 # usage: ctdb_check_tcp_ports <ports...>
 #
 # "ctdb checktcpport" exiting 0 means it was able to bind the port,
 # i.e. nothing is listening: a message is printed and 1 is returned.
 # Exit status 98 means the port is in use, which is the healthy case.
 # Any other status is reported and returned.  When all ports are
 # listening the "service started" flag file is removed and 0 is
 # returned.  Calling this with no ports exits with status 1.
 ctdb_check_tcp_ports()
 {
     if [ -z "$1" ] ; then
         echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
         exit 1
     fi

     for _p ; do  # process each function argument (port)
         _cmd="ctdb checktcpport $_p"
         _out=$($_cmd 2>&1)
         _ret=$?
         case "$_ret" in
             0)
                 _ctdb_check_tcp_common
                 if [ ! -f "$_ctdb_service_started_file" ] ; then
                     echo "ERROR: $service_name tcp port $_p is not responding"
                     debug "\"ctdb checktcpport $_p\" was able to bind to port"
                 else
                     # Flag file still present: the service has not
                     # yet passed a port check, so be less alarming.
                     echo "INFO: $service_name tcp port $_p is not responding"
                 fi

                 return 1
                 ;;
             98)
                 # Couldn't bind, something already listening, next port...
                 continue
                 ;;
             *)
                 echo "ERROR: unexpected error running \"ctdb checktcpport\""
                 # Feed the details to debug() on stdin, exactly as
                 # captured (no stray quote characters).
                 printf '%s\n' \
                     "ctdb checktcpport (exited with $_ret) with output:" \
                     "$_out" | debug
                 return $_ret
                 ;;
         esac
     done

     # All ports listening
     _ctdb_check_tcp_common
     rm -f "$_ctdb_service_started_file"
     return 0
 }
 532
 ######################################################
 # check a unix socket
 # usage: ctdb_check_unix_socket <socket_path>
 #
 # Succeeds when netstat shows a listening unix socket whose line ends
 # with the given path, or when no path is given.  Otherwise prints an
 # error mentioning $service_name and returns 1.
 ######################################################
 ctdb_check_unix_socket() {
     socket_path="$1"
     if [ -z "$socket_path" ] ; then
         return 0
     fi

     netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$" && return 0

     echo "ERROR: $service_name socket $socket_path not found"
     return 1
 }
 546
 ######################################################
 # check a command returns zero status
 # usage: ctdb_check_command <command>
 #
 # The command's combined stdout/stderr is captured in $_out.  If the
 # command fails, an error is printed, the captured output is passed
 # to debug() and the script exits with status 1.
 ######################################################
 ctdb_check_command ()
 {
     if ! _out=$("$@" 2>&1) ; then
         echo "ERROR: $* returned error"
         echo "$_out" | debug
         exit 1
     fi
 }
 559
 ################################################
 # kill off any TCP connections with the given IP
 #
 # usage: kill_tcp_connections IP [oneway]
 #
 # The connection list comes from get_tcp_connections_for_ip() and is
 # handed to "ctdb killtcp" on stdin.  With "oneway" as the second
 # argument only the "<remote> <local>" pair is sent for each
 # connection; otherwise the reversed pair is sent as well.
 ################################################
 kill_tcp_connections ()
 {
     _ip="$1"

     _oneway=false
     if [ "$2" = "oneway" ] ; then
         _oneway=true
     fi

     # Everything inside the braces is the right-hand side of this
     # pipeline and reads the connection list on stdin.
     get_tcp_connections_for_ip "$_ip" | {
         _killcount=0
         _connections=""
         # A single newline, used as the separator between entries.
         _nl="
"
         while read _dst _src; do
             _destport="${_dst##*:}"
             __oneway=$_oneway
             case $_destport in
                 # we only do one-way killtcp for CIFS
                 139|445) __oneway=true ;;
             esac

             echo "Killing TCP connection $_src $_dst"
             _connections="${_connections}${_nl}${_src} ${_dst}"
             if ! $__oneway ; then
                 # Two-way kill: add the reversed pair too.
                 _connections="${_connections}${_nl}${_dst} ${_src}"
             fi

             _killcount=$(($_killcount + 1))
         done

         # No connections were listed, so there is nothing to kill.
         if [ $_killcount -eq 0 ] ; then
             return
         fi

         echo "$_connections" | ctdb killtcp || {
             echo "Failed to send killtcp control"
             return
         }

         # Poll once a second until no connections remain, giving up
         # once the check has been made more than 3 times.
         _count=0
         while : ; do
             _remaining=$(get_tcp_connections_for_ip $_ip | wc -l)

             if [ $_remaining -eq 0 ] ; then
                 echo "Killed $_killcount TCP connections to released IP $_ip"
                 return
             fi

             _count=$(($_count + 1))
             if [ $_count -gt 3 ] ; then
                 echo "Timed out killing tcp connections for IP $_ip ($_remaining remaining)"
                 return
             fi

             echo "Waiting for $_remaining connections to be killed for IP $_ip"
             sleep 1
         done
     }
 }
 623
 ##################################################################
 # kill off the local end for any TCP connections with the given IP
 # $1 is the IP address.
 ##################################################################
 kill_tcp_connections_local_only ()
 {
     # "oneway" makes kill_tcp_connections() send only one pair per
     # connection.
     kill_tcp_connections "$1" "oneway"
 }
 631
 ##################################################################
 # tickle any TCP connections with the given IP
 #
 # Every connection listed by get_tcp_connections_for_ip() is tickled
 # in both directions with "ctdb tickle".  A single failure message is
 # printed at the end if any tickle failed.
 ##################################################################
 tickle_tcp_connections ()
 {
     _ip="$1"

     get_tcp_connections_for_ip "$_ip" |
     {
         _failed=false

         while read dest src; do
             # One pass per direction.  $_pair is left unquoted for
             # the ctdb call so that it splits back into two addresses.
             for _pair in "$src $dest" "$dest $src" ; do
                 echo "Tickle TCP connection $_pair"
                 ctdb tickle $_pair >/dev/null 2>&1 || _failed=true
             done
         done

         ! $_failed || echo "Failed to send tickle control"
     }
 }
 655
 # List established TCP connections whose local address is the given IP.
 #
 # Prints one "<local addr:port> <remote addr:port>" pair per line,
 # taken from "netstat -tn".  The local address may be the plain IP or
 # its "::ffff:" mapped form.
 get_tcp_connections_for_ip ()
 {
     _ip="$1"

     netstat -tn | awk -v ip=$_ip '
         $6 != "ESTABLISHED"   { next }
         index($1, "tcp") != 1 { next }
         index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1 {
             print $4" "$5
         }'
 }
 666
 ########################################################
 # start/stop the Ganesha nfs service
 #
 # $1 is start, stop or restart; anything else is ignored.  The
 # service name is "nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE".
 ########################################################
 startstop_ganesha()
 {
     _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
     case "$1" in
         start|stop|restart)
             # The operation is handed to service() unchanged.
             service "$_service_name" "$1"
             ;;
     esac
 }
 685
 ########################################################
 # start/stop the nfs service on different platforms
 #
 # $1 is start, stop or restart; anything else is ignored.
 #
 # The platform is "sles" when $CTDB_ETCDIR/init.d/nfsserver is
 # executable, and "rhel" when $CTDB_ETCDIR/init.d/nfslock is
 # executable or the nfs-lock systemd unit file is readable ("rhel"
 # wins if both match).  On any other platform an error is printed
 # and the script exits with status 1.
 ########################################################
 startstop_nfs() {
     # Paths are quoted and the two "rhel" probes are separate tests,
     # so whitespace in $CTDB_ETCDIR cannot break the detection.
     PLATFORM="unknown"
     if [ -x "$CTDB_ETCDIR/init.d/nfsserver" ] ; then
         PLATFORM="sles"
     fi
     if [ -x "$CTDB_ETCDIR/init.d/nfslock" ] ||
        [ -r /usr/lib/systemd/system/nfs-lock.service ] ; then
         PLATFORM="rhel"
     fi

     case "$PLATFORM:$1" in
         sles:start)
             service nfsserver start
             ;;
         sles:stop)
             service nfsserver stop > /dev/null 2>&1
             ;;
         sles:restart)
             # Set the nfsd thread count to 0 and stop the service,
             # kill any nfsd that is still around and dump stacks of
             # the remaining ones before starting again.
             set_proc "fs/nfsd/threads" 0
             service nfsserver stop > /dev/null 2>&1
             pkill -9 nfsd
             nfs_dump_some_threads
             service nfsserver start
             ;;
         rhel:start)
             service nfslock start
             service nfs start
             ;;
         rhel:stop)
             service nfs stop
             service nfslock stop
             ;;
         rhel:restart)
             set_proc "fs/nfsd/threads" 0
             service nfs stop > /dev/null 2>&1
             service nfslock stop > /dev/null 2>&1
             pkill -9 nfsd
             nfs_dump_some_threads
             service nfslock start
             service nfs start
             ;;
         sles:*|rhel:*)
             # Known platform, unknown operation: nothing to do.
             ;;
         *)
             echo "Unknown platform. NFS is not supported with ctdb"
             exit 1
             ;;
     esac
 }
 744
 # Dump up to the configured number of nfsd thread backtraces.
 #
 # The limit is $CTDB_NFS_DUMP_STUCK_THREADS (default 5); a limit of 0
 # disables dumping.  Threads whose stack cannot be read, or is empty,
 # do not count towards the limit.
 nfs_dump_some_threads ()
 {
     [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || CTDB_NFS_DUMP_STUCK_THREADS=5

     # Optimisation to avoid running an unnecessary pidof
     [ $CTDB_NFS_DUMP_STUCK_THREADS -gt 0 ] || return 0

     _count=0
     for _pid in $(pidof nfsd) ; do
         # Stop as soon as the configured number of traces has been
         # printed ("-lt", so that the limit itself is not exceeded).
         [ $_count -lt $CTDB_NFS_DUMP_STUCK_THREADS ] || break

         # Do this first to avoid racing with thread exit
         _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
         if [ -n "$_stack" ] ; then
             echo "Stack trace for stuck nfsd thread [${_pid}]:"
             echo "$_stack"
             _count=$(($_count + 1))
         fi
     done
 }
 766
 ########################################################
 # start/stop the nfs lockmanager service on different platforms
 #
 # $1 is start, stop or restart; anything else is ignored.
 #
 # The platform is "sles" when $CTDB_ETCDIR/init.d/nfsserver is
 # executable, and "rhel" when $CTDB_ETCDIR/init.d/nfslock is
 # executable or the nfs-lock systemd unit file is readable ("rhel"
 # wins if both match).  On any other platform an error is printed
 # and the script exits with status 1.
 ########################################################
 startstop_nfslock() {
     # Paths are quoted and the two "rhel" probes are separate tests,
     # so whitespace in $CTDB_ETCDIR cannot break the detection.
     PLATFORM="unknown"
     if [ -x "$CTDB_ETCDIR/init.d/nfsserver" ] ; then
         PLATFORM="sles"
     fi
     if [ -x "$CTDB_ETCDIR/init.d/nfslock" ] ||
        [ -r /usr/lib/systemd/system/nfs-lock.service ] ; then
         PLATFORM="rhel"
     fi

     case "$PLATFORM:$1" in
         # for sles there is no service for lockmanager
         # so we instead just shutdown/restart nfs
         sles:start)
             service nfsserver start
             ;;
         sles:stop)
             service nfsserver stop > /dev/null 2>&1
             ;;
         sles:restart)
             service nfsserver stop > /dev/null 2>&1
             service nfsserver start
             ;;
         rhel:start)
             service nfslock start
             ;;
         rhel:stop)
             service nfslock stop > /dev/null 2>&1
             ;;
         rhel:restart)
             service nfslock stop > /dev/null 2>&1
             service nfslock start
             ;;
         sles:*|rhel:*)
             # Known platform, unknown operation: nothing to do.
             ;;
         *)
             echo "Unknown platform. NFS locking is not supported with ctdb"
             exit 1
             ;;
     esac
 }
 817
 # Periodically update the statd database
 #
 # $1 is the update period in seconds.  The modification time of
 # $service_state_dir/update-trigger records the last update; once at
 # least the period has elapsed the trigger is touched and the
 # "updatelocal" and "updateremote" statd-callout operations are
 # started in the background.
 nfs_statd_update ()
 {
     _update_period="$1"

     _statd_update_trigger="$service_state_dir/update-trigger"
     if [ ! -f "$_statd_update_trigger" ] ; then
         touch "$_statd_update_trigger"
     fi

     _last_update=$(stat --printf="%Y" "$_statd_update_trigger")
     _current_time=$(date +"%s")

     # Not due yet: nothing to do.
     [ $(( $_current_time - $_last_update)) -ge $_update_period ] || return 0

     touch "$_statd_update_trigger"
     $CTDB_BASE/statd-callout updatelocal &
     $CTDB_BASE/statd-callout updateremote &
 }
 834
 835 ########################################################
 836
 # Add an address to an interface.
 #
 # usage: add_ip_to_iface IFACE IP MASKBITS
 #
 # The interface is brought up first; failure to do so aborts via
 # die().  Returns 1, after printing a message, if the address can not
 # be added.
 add_ip_to_iface ()
 {
     _iface=$1
     _ip=$2
     _maskbits=$3

     # Ensure interface is up
     if ! ip link set "$_iface" up ; then
         die "Failed to bringup interface $_iface"
     fi

     if ! ip addr add "$_ip/$_maskbits" brd + dev "$_iface" ; then
         echo "Failed to add $_ip/$_maskbits on dev $_iface"
         return 1
     fi
 }
 852
 # Remove an address from an interface.
 #
 # usage: delete_ip_from_iface IFACE IP MASKBITS
 #
 # Returns 1, after printing a message, if the address can not be
 # deleted.
 delete_ip_from_iface()
 {
     _iface=$1
     _ip=$2
     _maskbits=$3

     # Enable promote_secondaries on just this interface.  It could be
     # set globally, but limiting it to the interfaces where CTDB has
     # public IP addresses avoids surprises.  There is nowhere more
     # convenient to do this, so it is set on every call, which is
     # still much cheaper than remembering and re-adding secondaries.
     set_proc "sys/net/ipv4/conf/${_iface}/promote_secondaries" 1

     if ! ip addr del "$_ip/$_maskbits" dev "$_iface" ; then
         echo "Failed to del $_ip on dev $_iface"
         return 1
     fi
 }
 871
 # If the given IP is hosted then print 2 items: maskbits and iface
 #
 # $1 is the address.  The values come from the "inet" line(s) that
 # "ip addr show to ADDR/32" prints: maskbits is the part of the second
 # field after the "/", iface is the last field.
 ip_maskbits_iface ()
 {
     _addr="$1"

     # sub() is used rather than gensub(), which only gawk provides.
     ip addr show to "${_addr}/32" 2>/dev/null | \
         awk '$1 == "inet" { sub(/^.*\//, "", $2) ; print $2, $NF }'
 }
 880
 # Remove a public address from whichever interface hosts it.
 #
 # $1 is the address, optionally followed by "/maskbits" (which is
 # ignored).  Nothing happens if ip_maskbits_iface() does not find the
 # address.
 drop_ip ()
 {
     _addr="${1%/*}"  # Remove optional maskbits

     # Reuse the positional parameters for the "maskbits iface" pair.
     set -- $(ip_maskbits_iface $_addr)
     [ -n "$1" ] || return 0

     _maskbits="$1"
     _iface="$2"
     echo "Removing public address $_addr/$_maskbits from device $_iface"
     delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
 }
 893
 # Call drop_ip() for the first field of every line in the file named
 # by $CTDB_PUBLIC_ADDRESSES.  With that variable unset or empty,
 # /dev/null is read and nothing is dropped.
 drop_all_public_ips ()
 {
     # $_x soaks up the remaining fields of each line.
     while read _ip _x ; do
         drop_ip "$_ip"
     done <"${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
 }
 900
 ########################################################
 # Simple counters
 #
 # Shared setup for the counter functions: sets $_service_name to $1,
 # falling back to $service_name and then $script_name, sets
 # $_counter_file to the matching file under $ctdb_fail_dir and makes
 # sure its directory exists.
 _ctdb_counter_common () {
     if [ -n "$1" ] ; then
         _service_name="$1"
     elif [ -n "$service_name" ] ; then
         _service_name="$service_name"
     else
         _service_name="${script_name}"
     fi

     _counter_file="$ctdb_fail_dir/$_service_name"
     mkdir -p "${_counter_file%/*}" # dirname
 }
 # Reset a failure counter by truncating its counter file.
 # $1 optionally names the service (see _ctdb_counter_common).
 ctdb_counter_init () {
     _ctdb_counter_common "$1"

     : >"$_counter_file"
 }
 # Increment a failure counter.
 # $1 optionally names the service (see _ctdb_counter_common).
 ctdb_counter_incr () {
     _ctdb_counter_common "$1"

     # unary counting!  The count is the size of the file, so exactly
     # one byte must be appended: printf is used because "echo -n" is
     # not portable and may write "-n" and a newline instead.
     printf '1' >> "$_counter_file"
 }
 # Compare a failure counter against a limit.
 #
 # usage: ctdb_check_counter [MSG [OP [LIMIT [SERVICE_NAME]]]]
 #
 # MSG defaults to "error", OP to -ge and LIMIT to
 # $service_fail_limit.  OP is an integer operator supported by test,
 # or "%", which hits when the count is a multiple of LIMIT.  When the
 # comparison hits: with MSG "error" a message is printed and the
 # script exits with status 1; with any other MSG the function just
 # returns 1.
 ctdb_check_counter () {
     _msg="${1:-error}"  # "error"  - anything else is silent on fail
     _op="${2:--ge}"  # an integer operator supported by test
     _limit="${3:-${service_fail_limit}}"

     # Drop the three leading arguments.  Callers relying on the
     # defaults pass fewer, and "shift 3" would then fail (fatally in
     # some shells), so never shift more than is there.
     if [ $# -ge 3 ] ; then
         shift 3
     else
         shift $#
     fi
     _ctdb_counter_common "$1"

     # unary counting!  A missing counter file counts as 0.
     _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
     _hit=false
     if [ "$_op" != "%" ] ; then
         if [ $_size $_op $_limit ] ; then
             _hit=true
         fi
     else
         if [ $(($_size $_op $_limit)) -eq 0 ] ; then
             _hit=true
         fi
     fi
     if $_hit ; then
         if [ "$_msg" = "error" ] ; then
             echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
             exit 1
         else
             return 1
         fi
     fi
 }
 947
 948 ########################################################
 949
 950 ctdb_status_dir="$CTDB_VARDIR/state/service_status"
 951 ctdb_fail_dir="$CTDB_VARDIR/state/failcount"
 952
 # Set $service_state_dir for a service and make sure it exists.
 #
 # $1 optionally names the service; it defaults to $service_name.
 # Exits with status 1 if the directory can not be created.
 ctdb_setup_service_state_dir ()
 {
     service_state_dir="$CTDB_VARDIR/state/service_state/${1:-${service_name}}"

     if ! mkdir -p "$service_state_dir" ; then
         echo "Error creating state dir \"$service_state_dir\""
         exit 1
     fi
 }
 961
 962 ########################################################
 963 # Managed status history, for auto-start/stop
 964
 965 ctdb_managed_dir="$CTDB_VARDIR/state/managed_history"
 966
 # Set $_ctdb_managed_file to the managed-history file for
 # $service_name.
 _ctdb_managed_common ()
 {
     _ctdb_managed_file="$ctdb_managed_dir/$service_name"
 }
 971
 # Record $service_name as managed by creating its managed-history
 # file (and the directory that holds it).
 ctdb_service_managed ()
 {
     _ctdb_managed_common
     mkdir -p "$ctdb_managed_dir"
     touch "$_ctdb_managed_file"
 }
 978
 # Record $service_name as no longer managed by removing its
 # managed-history file.
 ctdb_service_unmanaged ()
 {
     _ctdb_managed_common
     rm -f "$_ctdb_managed_file"
 }
 984
 # Succeed if the managed-history file for $service_name exists.
 is_ctdb_previously_managed_service ()
 {
     _ctdb_managed_common
     [ -f "$_ctdb_managed_file" ]
 }
 990
 991 ########################################################
 992 # Check and set status
 993
 # Print a one-line report of a status problem.
 # $1 is the node status, $2 the file holding the problem description.
 log_status_cat ()
 {
     # The file name is quoted so that a path containing whitespace or
     # glob characters is read as a single file.
     echo "node is \"$1\", \"${script_name}\" reports problem: $(cat "$2")"
 }
 998
 # Report the status recorded for $script_name.
 #
 # Returns 1 if a readable "unhealthy" file exists in the script's
 # status directory, 2 if a readable "banned" file exists, and 0
 # otherwise.  "unhealthy" is checked first.  The matching file is
 # reported through log_status_cat().
 ctdb_checkstatus ()
 {
     if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
         log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
         return 1
     fi

     if [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
         log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
         return 2
     fi

     return 0
 }
1011
# Record or clear the status of $script_name.
#   $1 - "unhealthy" or "banned" to record that status; anything else
#        clears both markers
#   $2 - file whose contents are stored as the reason (only read for
#        "unhealthy"/"banned")
# Uses underscore-prefixed temporaries so that a caller's own $d / $i
# variables are not clobbered (shell variables are global).
ctdb_setstatus ()
{
    _status_dir="$ctdb_status_dir/$script_name"
    case "$1" in
        unhealthy|banned)
            mkdir -p "$_status_dir"
            cat "$2" >"$_status_dir/$1"
            ;;
        *)
            for _status_flag in "banned" "unhealthy" ; do
                rm -f "$_status_dir/$_status_flag"
            done
            ;;
    esac
}
1027
1028 ##################################################################
1029 # Reconfigure a service on demand
1030
# Shared setup for the reconfigure helpers: ensure the status directory
# for $service_name exists, leaving its path in $_d and the path of the
# "reconfigure" flag file in $_ctdb_service_reconfigure_flag.
_ctdb_service_reconfigure_common ()
{
    _d="${ctdb_status_dir}/${service_name}"
    _ctdb_service_reconfigure_flag="${_d}/reconfigure"

    mkdir -p "${_d}"
}
1037
# Succeed (return 0) if a reconfigure has been scheduled for
# $service_name, i.e. its "reconfigure" flag file exists.
ctdb_service_needs_reconfigure ()
{
    _ctdb_service_reconfigure_common

    test -e "${_ctdb_service_reconfigure_flag}"
}
1043
# Schedule a reconfigure of $service_name by creating (or truncating)
# its "reconfigure" flag file.
ctdb_service_set_reconfigure ()
{
    _ctdb_service_reconfigure_common

    : >"${_ctdb_service_reconfigure_flag}"
}
1049
# Cancel any scheduled reconfigure of $service_name by removing its
# "reconfigure" flag file.
ctdb_service_unset_reconfigure ()
{
    _ctdb_service_reconfigure_common

    rm -f "${_ctdb_service_reconfigure_flag}"
}
1055
# Reconfigure $service_name now: clear the scheduled-reconfigure flag,
# run service_reconfigure() and, if that succeeds, call
# ctdb_counter_init.  Returns the status of service_reconfigure() if it
# fails.
ctdb_service_reconfigure ()
{
    echo "Reconfiguring service \"${service_name}\"..."
    ctdb_service_unset_reconfigure

    if service_reconfigure ; then
        ctdb_counter_init
    else
        # $? here is still the exit status of service_reconfigure
        return $?
    fi
}
1063
# Default service_reconfigure(): a no-op that always succeeds.
service_reconfigure ()
{
    return 0
}
1069
# Try to take the per-service reconfigure lock for the current event.
#
# The lock is a file ($_d/reconfigure_lock) whose first line is the name
# of the event holding the lock and whose following lines are PIDs.
# Returns 0 if the lock was taken (file rewritten with $event_name and
# our PID), 1 if another live process is recorded as holding it.
ctdb_reconfigure_take_lock ()
{
    # Sets $_d (and creates that directory)
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"
    mkdir -p "${_lock%/*}" # dirname
    # Make sure the file exists so it can be opened for reading below
    touch "$_lock"

    # The subshell has the lock file on stdin; its exit status is the
    # function's return value.
    (
        # Serialise access: lock file descriptor 0, i.e. the lock file
        # itself.  NOTE(review): presumably an exclusive lock (flock's
        # default) that is dropped when the subshell exits - confirm.
        flock 0
        # This is overkill but will work if we need to extend this to
        # allow certain events to run multiple times in parallel
        # (e.g. takeip) and write multiple PIDs to the file.
        read _locker_event
        if [ -n "$_locker_event" ] ; then
            # Remaining lines are PIDs; a live PID other than our own
            # means somebody else holds the lock.
            while read _pid ; do
                if [ -n "$_pid" -a "$_pid" != $$ ] && \
                    kill -0 "$_pid" 2>/dev/null ; then
                    exit 1
                fi
            done
        fi

        # Nobody (alive) holds the lock: record ourselves as the holder
        printf "%s\n%s\n" "$event_name" $$ >"$_lock"
        exit 0
    ) <"$_lock"
}
1096
# Release the per-service reconfigure lock by deleting the lock file.
# No check is made that the caller is the recorded holder.
ctdb_reconfigure_release_lock ()
{
    _ctdb_service_reconfigure_common

    _lock="${_d}/reconfigure_lock"
    rm -f "${_lock}"
}
1104
# Exit the script with the result that "ctdb scriptstatus" last recorded
# for this script's monitor event, re-printing its error output.
# This function never returns: it always ends in "exit".
ctdb_replay_monitor_status ()
{
    echo "Replaying previous status for this script due to reconfigure..."
    # Leading colon (':') is missing in some versions...
    _out=$(ctdb scriptstatus -Y | grep -E "^:?monitor:${script_name}:")
    # Output looks like this:
    # :monitor:60.nfs:1:ERROR:1314764004.030861:1314764004.035514:foo bar:
    # This is the cheapest way of getting fields in the middle.
    # Splitting on ':' and then re-splitting the echoed words on
    # whitespace drops the empty leading field, so $3 is the return code
    # and $4 the status with or without the leading colon.
    set -- $(IFS=":" ; echo $_out)
    _code="$3"
    _status="$4"
    # The error output field can include colons so we'll try to
    # preserve them.  The weak checking at the beginning tries to make
    # this work for both broken (no leading ':') and fixed output.
    # Strip the trailing ':' and then everything up to and including the
    # second timestamp field.
    _out="${_out%:}"
    _err_out="${_out#*monitor:${script_name}:*:*:*:*:}"
    case "$_status" in
        OK) : ;;  # Do nothing special.
        TIMEDOUT)
            # Recast this as an error, since we can't exit with the
            # correct negative number.
            _code=1
            _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        DISABLED)
            # Recast this as an OK, since we can't exit with the
            # correct negative number.
            _code=0
            _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        *) : ;;  # Must be ERROR, do nothing special.
    esac
    if [ -n "$_err_out" ] ; then
        echo "$_err_out"
    fi
    # NOTE(review): if no matching scriptstatus line was found $_code is
    # empty and this becomes a bare "exit" - confirm that is acceptable.
    exit $_code
}
1142
# Coordinate reconfiguration of $service_name with the "monitor",
# "ipreallocated" and "reconfigure" events, using the reconfigure lock
# so that these events do not collide.
#
# Depending on $event_name and whether the lock could be taken this
# either returns (the caller continues with the event) or exits the
# script:
#   reconfigure   - lock taken: reconfigure and exit with its status;
#                   lock busy: exit 2 so that the caller retries
#   ipreallocated - lock taken: run any scheduled reconfigure;
#                   lock busy: return 0 without reconfiguring
#   monitor       - lock taken: nothing special;
#                   lock busy: replay the previous monitor status (exits)
#   anything else - return 0 immediately
ctdb_service_check_reconfigure ()
{
    assert_service_name

    # We only care about some events in this function.  For others we
    # return now.
    case "$event_name" in
        monitor|ipreallocated|reconfigure) : ;;
        *) return 0 ;;
    esac

    if ctdb_reconfigure_take_lock ; then
        # No events covered by this function are running, so proceed
        # freely.
        case "$event_name" in
            reconfigure)
                # Run in a subshell so that its status can be passed
                # to "exit" below.  Note that the lock file is not
                # removed on this path.
                (ctdb_service_reconfigure)
                exit $?
                ;;
            ipreallocated)
                if ctdb_service_needs_reconfigure ; then
                    ctdb_service_reconfigure
                fi
                ;;
        esac

        ctdb_reconfigure_release_lock
    else
        # Somebody else is running an event we don't want to collide
        # with.  We proceed with caution.
        case "$event_name" in
            reconfigure)
                # Tell whoever called us to retry.
                exit 2
                ;;
            ipreallocated)
                # Defer any scheduled reconfigure and just run the
                # rest of the ipreallocated event, as per the
                # eventscript.  There's an assumption here that the
                # event doesn't depend on any scheduled reconfigure.
                # This is true in the current code.
                return 0
                ;;
            monitor)
                # There is most likely a reconfigure in progress so
                # the service is possibly unstable.  As above, we
                # defer any scheduled reconfigure.  We also replay
                # the previous monitor status since that's the best
                # information we have.
                ctdb_replay_monitor_status
                ;;
        esac
    fi
}
1197
1198 ##################################################################
1199 # Does CTDB manage this service? - and associated auto-start/stop
1200
# Backward-compatibility helper for the old CTDB_MANAGES_* variables.
#   $1 - value of a CTDB_MANAGES_* variable
#   $2 - service name that variable refers to
# If $1 is "yes" and $2 is the current $service_name then $2 is appended
# to $CTDB_MANAGED_SERVICES.  Always returns 0.
#
# Two separate tests joined with && are used instead of the obsolescent
# "-a" operator, whose parsing is ambiguous for unusual operand values.
ctdb_compat_managed_service ()
{
    if [ "$1" = "yes" ] && [ "$2" = "$service_name" ] ; then
        CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
    fi
}
1207
# Succeed (return 0) if $service_name is listed in
# $CTDB_MANAGED_SERVICES, either directly or after folding in the legacy
# CTDB_MANAGES_* settings; return 1 otherwise.
is_ctdb_managed_service ()
{
    assert_service_name

    # Pad the list with spaces so that only whole words can match.
    t=" $CTDB_MANAGED_SERVICES "
    case "$t" in
        *" "${service_name}" "*) return 0 ;;
    esac

    # Not listed: update $CTDB_MANAGED_SERVICES from the old-style
    # variables for backward compatibility and look again.
    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"

    t=" $CTDB_MANAGED_SERVICES "
    case "$t" in
        *" "${service_name}" "*) return 0 ;;
    esac
    return 1
}
1238
# Start or stop $service_name when appropriate for the current event.
#
# - "service-start"/"service-stop" pseudo-events start/stop the service
#   explicitly and exit with the result; they are refused (via die) if
#   the service is managed or CTDB_SERVICE_AUTOSTARTSTOP is "yes".
# - For "monitor" events with CTDB_SERVICE_AUTOSTARTSTOP=yes the service
#   is started in the background if it has just become managed, or
#   stopped if it is no longer managed; the script then exits.
# - Otherwise the function returns 0 and does nothing.
ctdb_start_stop_service ()
{
    assert_service_name

    # Explicit start/stop requests are only honoured when we're not
    # auto-starting/stopping and the service isn't managed.
    case "$event_name" in
        service-start)
            is_ctdb_managed_service && \
                die 'service-start event not permitted when service is managed'
            [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] && \
                die 'service-start event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
            ctdb_service_start
            exit $?
            ;;
        service-stop)
            is_ctdb_managed_service && \
                die 'service-stop event not permitted when service is managed'
            [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] && \
                die 'service-stop event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
            ctdb_service_stop
            exit $?
            ;;
    esac

    # Auto start/stop only happens if configured, and only on monitor.
    if [ "$CTDB_SERVICE_AUTOSTARTSTOP" != "yes" ] ; then
        return 0
    fi
    if [ "$event_name" != "monitor" ] ; then
        return 0
    fi

    if is_ctdb_managed_service ; then
        # Managed now and before: nothing to do.
        if is_ctdb_previously_managed_service ; then
            return 0
        fi
        echo "Starting service \"$service_name\" - now managed"
        background_with_logging ctdb_service_start
        exit $?
    fi

    # Not managed now and not before: nothing to do.
    if ! is_ctdb_previously_managed_service ; then
        return 0
    fi
    echo "Stopping service \"$service_name\" - no longer managed"
    background_with_logging ctdb_service_stop
    exit $?
}
1288
# Start $service_name via service_start().  The service is recorded as
# managed before the start is attempted, so it counts as managed even if
# starting fails.  On success ctdb_counter_init and ctdb_check_tcp_init
# are called; on failure the status of service_start() is returned.
ctdb_service_start ()
{
    ctdb_service_managed

    if service_start ; then
        ctdb_counter_init
        ctdb_check_tcp_init
    else
        # $? here is still the exit status of service_start
        return $?
    fi
}
1299
# Stop $service_name: drop its managed-history marker first, then call
# service_stop(), whose status is returned.
ctdb_service_stop ()
{
    ctdb_service_unmanaged

    service_stop
}
1305
# Default service_start() and service_stop() implementations.
# An eventscript may override either of them.

# Start $service_name using the "service" command.
service_start ()
{
    service "${service_name}" start
}
1313
# Stop $service_name using the "service" command.
service_stop ()
{
    service "${service_name}" stop
}
1318
1319 ##################################################################
1320
# Handle the events that are common to eventscripts.
#   $1 - event name; remaining arguments belong to the event
# "status" reports the recorded status and "setstatus" records one; in
# both cases the script exits with the status of that operation.  Any
# other event is left to the caller (returns 0).
ctdb_standard_event_handler ()
{
    if [ "$1" = "status" ] ; then
        ctdb_checkstatus
        exit
    fi

    if [ "$1" = "setstatus" ] ; then
        shift
        ctdb_setstatus "$@"
        exit
    fi
}
1335
# iptables doesn't like being re-entered, so flock-wrap it: run
# /sbin/iptables with the given arguments while holding a lock on
# $CTDB_VARDIR/iptables-ctdb.flock, waiting at most 30 seconds for the
# lock.  The lock path is quoted so that a $CTDB_VARDIR containing
# whitespace is passed to flock as a single argument.
iptables()
{
        flock -w 30 "$CTDB_VARDIR/iptables-ctdb.flock" /sbin/iptables "$@"
}
1341
# AIX (and perhaps others?) doesn't have mktemp.  Provide a minimal
# replacement that supports "mktemp" and "mktemp -d" only.
if ! which mktemp >/dev/null 2>&1 ; then
    # Create a private (umask 077) temporary file, or directory with
    # -d, under ${TMPDIR:-/tmp} and print its path.  Returns non-zero
    # and prints nothing if the file or directory could not be created.
    mktemp ()
    {
        _dir=false
        if [ "$1" = "-d" ] ; then
            _dir=true
            shift
        fi
        _d="${TMPDIR:-/tmp}"
        # First 10 hex digits of an MD5 over some random bytes
        _hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
            md5sum | \
            sed -e 's@\(..........\).*@\1@')
        _t="${_d}/tmp.${_hex10}"
        (
            umask 077
            if $_dir ; then
                mkdir "$_t"
            else
                # noclobber: fail instead of truncating (or following)
                # something that already exists at this path
                set -C
                : >"$_t"
            fi
        ) || return 1
        echo "$_t"
    }
fi
1367
1368 ########################################################
1369 # tickle handling
1370 ########################################################
1371
# Synchronise CTDB's tickle list with the TCP connections currently
# established to this node's public IPs on a given port.
#   $1 - local TCP port of the service
# Tickles are added for connections that have none and removed for
# connections that no longer exist.  Scratch files are kept under
# $CTDB_VARDIR/state/tickles and removed at the end.
update_tickles ()
{
        _port="$1"

        tickledir="$CTDB_VARDIR/state/tickles"
        mkdir -p "$tickledir"

        # Who am I?  (strip the "PNN:" prefix from the output)
        _pnn=$(ctdb pnn) ; _pnn=${_pnn#PNN:}

        # What public IPs do I hold?  (field 2 where field 3 is our PNN)
        _ips=$(ctdb -Y ip | awk -F: -v pnn=$_pnn '$3 == pnn {print $2}')

        # IPs as a regexp choice: "(ip1|ip2|...)" with the dots escaped.
        # NOTE(review): the dots appear to be double-escaped because awk
        # processes backslash escapes in -v assignments - confirm.
        _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"

        # Record connections to our public IPs in a temporary file,
        # one "<remote addr:port> <local addr:port>" pair per line
        _my_connections="${tickledir}/${_port}.connections"
        rm -f "$_my_connections"
        netstat -tn |
        awk -v destpat="^${_ipschoice}:${_port}\$" \
          '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
        sort >"$_my_connections"

        # Record our current tickles in a temporary file, in the same
        # format and sort order so that comm(1) can compare the two
        _my_tickles="${tickledir}/${_port}.tickles"
        rm -f "$_my_tickles"
        for _i in $_ips ; do
                ctdb -Y gettickles $_i $_port |
                awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
        done |
        sort >"$_my_tickles"

        # Add tickles for connections that we haven't already got tickles for
        # (lines only in the connections file)
        comm -23 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
                ctdb addtickle $_src $_dst
        done

        # Remove tickles for connections that are no longer there
        # (lines only in the tickles file)
        comm -13 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
                ctdb deltickle $_src $_dst
        done

        rm -f "$_my_connections" "$_my_tickles"
}
1419
1420 ########################################################
1421 # load a site local config file
1422 ########################################################
1423
# Source the site-local configuration, in this order:
#   1. the file named by $CTDB_RC_LOCAL, if set and executable
#   2. $CTDB_BASE/rc.local, if executable
#   3. every executable file in $CTDB_BASE/rc.local.d/
# Paths are quoted so that a $CTDB_BASE containing whitespace works, and
# separate tests are used instead of the obsolescent "-a" operator.
[ -n "$CTDB_RC_LOCAL" ] && [ -x "$CTDB_RC_LOCAL" ] && {
        . "$CTDB_RC_LOCAL"
}

[ -x "$CTDB_BASE/rc.local" ] && {
        . "$CTDB_BASE/rc.local"
}

[ -d "$CTDB_BASE/rc.local.d" ] && {
        for i in "$CTDB_BASE"/rc.local.d/* ; do
                [ -x "$i" ] && . "$i"
        done
}
1437
1438 script_name="${0##*/}"       # basename
1439 service_fail_limit=1
1440 event_name="$1"