config/functions

   1 # Hey Emacs, this is a -*- shell-script -*- !!!
   2
   3 # utility functions for ctdb event scripts
   4
   5 PATH=/bin:/usr/bin:/usr/sbin:/sbin:$PATH
   6
   7 [ -z "$CTDB_VARDIR" ] && {
   8     export CTDB_VARDIR="/var/ctdb"
   9 }
  10 [ -z "$CTDB_ETCDIR" ] && {
  11     export CTDB_ETCDIR="/etc"
  12 }
  13
  14 #######################################
  15 # pull in a system config file, if any
  16 _loadconfig() {
  17
  18     if [ -z "$1" ] ; then
  19         foo="${service_config:-${service_name}}"
  20         if [ -n "$foo" ] ; then
  21             loadconfig "$foo"
  22         fi
  23     elif [ "$1" != "ctdb" ] ; then
  24         loadconfig "ctdb"
  25     fi
  26
  27     if [ -f $CTDB_ETCDIR/sysconfig/$1 ]; then
  28         . $CTDB_ETCDIR/sysconfig/$1
  29     elif [ -f $CTDB_ETCDIR/default/$1 ]; then
  30         . $CTDB_ETCDIR/default/$1
  31     elif [ -f $CTDB_BASE/sysconfig/$1 ]; then
  32         . $CTDB_BASE/sysconfig/$1
  33     fi
  34 }
  35
  36 loadconfig () {
  37     _loadconfig "$@"
  38 }
  39
  40 ##############################################################
  41 # make sure CTDB_CURRENT_DEBUGLEVEL is set to the desired debug level
  42 # (integer)
  43 #
  44 # If it is already set then do nothing, since it might have been set
  45 # via a file in rc.local.d/.  If it is not set then set it by sourcing
  46 # /var/ctdb/eventscript_debuglevel. If this file does not exist then
  47 # create it using output from "ctdb getdebug".  If the optional 1st arg
  48 # is "create" then don't source an existing file but create a new one
  49 # instead - this is useful for creating the file just once in each
  50 # event run in 00.ctdb.  If there's a problem getting the debug level
  51 # from ctdb then it is silently set to 0 - no use spamming logs if our
  52 # debug code is broken...
  53 ctdb_set_current_debuglevel ()
  54 {
  55     [ -z "$CTDB_CURRENT_DEBUGLEVEL" ] || return 0
  56
  57     _f="$CTDB_VARDIR/eventscript_debuglevel"
  58
  59     if [ "$1" = "create" -o ! -r "$_f" ] ; then
  60         _t=$(ctdb getdebug -Y 2>/dev/null)
  61         # get last field of output
  62         _t="${_t%:}"
  63         _t="${_t##*:}"
  64         # Defaults to 0
  65         echo "export CTDB_CURRENT_DEBUGLEVEL=\"${_t:-0}\"" >"$_f"
  66     fi
  67
  68     . "$_f"
  69 }
  70
  71 ##############################################################
  72 # determine on what type of system (init style) we are running
  73 detect_init_style() {
  74     # only do detection if not already set:
  75     test "x$CTDB_INIT_STYLE" != "x" && return
  76
  77     if [ -x /sbin/startproc ]; then
  78         CTDB_INIT_STYLE="suse"
  79     elif [ -x /sbin/start-stop-daemon ]; then
  80         CTDB_INIT_STYLE="debian"
  81     else
  82         CTDB_INIT_STYLE="redhat"
  83     fi
  84 }
  85
  86 ######################################################
  87 # simulate /sbin/service on platforms that don't have it
  88 # _service() makes it easier to hook the service() function for
  89 # testing.
  90 _service ()
  91 {
  92   _service_name="$1"
  93   _op="$2"
  94
  95   # do nothing, when no service was specified
  96   [ -z "$_service_name" ] && return
  97
  98   if [ -x /sbin/service ]; then
  99       $_nice /sbin/service "$_service_name" "$_op"
 100   elif [ -x $CTDB_ETCDIR/init.d/$_service_name ]; then
 101       $_nice $CTDB_ETCDIR/init.d/$_service_name "$_op"
 102   elif [ -x $CTDB_ETCDIR/rc.d/init.d/$_service_name ]; then
 103       $_nice $CTDB_ETCDIR/rc.d/init.d/$_service_name "$_op"
 104   fi
 105 }
 106
 107 service()
 108 {
 109     _nice=""
 110     _service "$@"
 111 }
 112
 113 ######################################################
 114 # simulate /sbin/service (niced) on platforms that don't have it
 115 nice_service()
 116 {
 117     _nice="nice"
 118     _service "$@"
 119 }
 120
 121 ######################################################
 122 # wrapper around /proc/ settings to allow them to be hooked
 123 # for testing
 124 # 1st arg is relative path under /proc/, 2nd arg is value to set
 125 set_proc ()
 126 {
 127     echo "$2" >"/proc/$1"
 128 }
 129
 130 ######################################################
 131 # wrapper around getting file contents from /proc/ to allow
 132 # this to be hooked for testing
 133 # 1st arg is relative path under /proc/
 134 get_proc ()
 135 {
 136     cat "/proc/$1"
 137 }
 138
 139 ######################################################
 140 # Check that an RPC service is healthy -
 141 # this includes allowing a certain number of failures
 142 # before marking the NFS service unhealthy.
 143 #
 144 # usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
 145 #
 146 # each triple is a set of 3 arguments: an operator, a
 147 # fail count limit and an action string.
 148 #
 149 # For example:
 150 #
 151 #       nfs_check_rpc_service "lockd" \
 152 #           -ge 15 "verbose restart unhealthy" \
 153 #           -eq 10 "restart:bs"
 154 #
 155 # says that if lockd is down for 15 iterations then do
 156 # a verbose restart of lockd and mark the node unhealthy.
 157 # Before this, after 10 iterations of failure, the
 158 # service is restarted silently in the background.
 159 # Order is important: the number of failures need to be
 160 # specified in reverse order because processing stops
 161 # after the first condition that is true.
 162 ######################################################
 163 nfs_check_rpc_service ()
 164 {
 165     _prog_name="$1" ; shift
 166
 167     _version=1
 168     _rpc_prog="$_prog_name"
 169     _restart=""
 170     _opts=""
 171     case "$_prog_name" in
 172         knfsd)
 173             _rpc_prog=nfs
 174             _version=3
 175             _restart="echo 'Trying to restart NFS service'"
 176             _restart="${_restart}; startstop_nfs restart"
 177             ;;
 178         mountd)
 179             _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
 180             ;;
 181         rquotad)
 182             _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
 183             ;;
 184         lockd)
 185             _rpc_prog=nlockmgr
 186             _version=4
 187             _restart="echo 'Trying to restart lock manager service'"
 188             _restart="${_restart}; startstop_nfslock restart"
 189             ;;
 190         statd)
 191             _rpc_prog=status
 192             _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
 193             _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
 194             _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
 195             ;;
 196         *)
 197             echo "Internal error: unknown RPC program \"$_prog_name\"."
 198             exit 1
 199     esac
 200
 201     _service_name="nfs_${_prog_name}"
 202
 203     if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
 204         ctdb_counter_init "$_service_name"
 205         return 0
 206     fi
 207
 208     ctdb_counter_incr "$_service_name"
 209
 210     while [ -n "$3" ] ; do
 211         ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || {
 212             for _action in $3 ; do
 213                 case "$_action" in
 214                     verbose)
 215                         echo "$ctdb_check_rpc_out"
 216                         ;;
 217                     restart|restart:*)
 218                         # No explicit command specified, construct rpc command.
 219                         if [ -z "$_restart" ] ; then
 220                             _p="rpc.${_prog_name}"
 221                             _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'"
 222                             _restart="${_restart}; killall -q -9 $_p"
 223                             _restart="${_restart}; $_p $_opts"
 224                         fi
 225
 226                         # Process restart flags...
 227                         _flags="${_action#restart:}"
 228                         # There may not have been a colon...
 229                         [ "$_flags" != "$_action" ] || _flags=""
 230                         # q=quiet - everything to /dev/null
 231                         if [ "${_flags#*q}" != "$_flags" ] ; then
 232                             _restart="{ ${_restart} ; } >/dev/null 2>&1"
 233                         fi
 234                         # s=stealthy - last command to /dev/null
 235                         if [ "${_flags#*s}" != "$_flags" ] ; then
 236                             _restart="${_restart} >/dev/null 2>&1"
 237                         fi
 238                         # b=background - the whole thing, easy and reliable
 239                         if [ "${_flags#*b}" != "$_flags" ] ; then
 240                             _restart="{ ${_restart} ; } &"
 241                         fi
 242
 243                         # Do it!
 244                         eval "${_restart}"
 245                         ;;
 246                     unhealthy)
 247                         exit 1
 248                         ;;
 249                     *)
 250                         echo "Internal error: unknown action \"$_action\"."
 251                         exit 1
 252                 esac
 253             done
 254
 255             # Only process the first action group.
 256             break
 257         }
 258         shift 3
 259     done
 260 }
 261
 262 ######################################################
 263 # check that a rpc server is registered with portmap
 264 # and responding to requests
 265 # usage: ctdb_check_rpc SERVICE_NAME VERSION
 266 ######################################################
 267 ctdb_check_rpc ()
 268 {
 269     progname="$1"
 270     version="$2"
 271
 272     if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then
 273         ctdb_check_rpc_out="ERROR: $progname failed RPC check:
 274 $ctdb_check_rpc_out"
 275         echo "$ctdb_check_rpc_out"
 276         return 1
 277     fi
 278 }
 279
 280 ######################################################
 281 # check a set of directories is available
 282 # return 1 on a missing directory
 283 # usage: ctdb_check_directories_probe  (directories are read from stdin, one per line)
 284 ######################################################
 285 ctdb_check_directories_probe() {
 286     while IFS="" read d ; do
 287         case "$d" in
 288             *%*)
 289                 continue
 290                 ;;
 291             *)
 292                 [ -d "${d}/." ] || return 1
 293         esac
 294     done
 295 }
 296
 297 ######################################################
 298 # check a set of directories is available
 299 # usage: ctdb_check_directories [SERVICE_NAME]  (directories are read from stdin, one per line)
 300 ######################################################
 301 ctdb_check_directories() {
 302     n="${1:-${service_name}}"
 303     ctdb_check_directories_probe || {
 304         echo "ERROR: $n directory \"$d\" not available"
 305         exit 1
 306     }
 307 }
 308
 309 ######################################################
 310 # check a set of tcp ports
 311 # usage: ctdb_check_tcp_ports <ports...>
 312 ######################################################
 313 ctdb_check_tcp_ports() {
 314
 315     for p ; do
 316         if ! netstat -a -t -n | grep -q "0\.0\.0\.0:$p .*LISTEN" ; then
 317             if ! netstat -a -t -n | grep -q ":::$p .*LISTEN" ; then
 318                 echo "ERROR: $service_name tcp port $p is not responding"
 319                 return 1
 320             fi
 321         fi
 322     done
 323 }
 324
 325 ######################################################
 326 # check a unix socket
 327 # usage: ctdb_check_unix_socket <socket_path>
 328 ######################################################
 329 ctdb_check_unix_socket() {
 330     socket_path="$1"
 331     [ -z "$socket_path" ] && return
 332
 333     if ! netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$"; then
 334         echo "ERROR: $service_name socket $socket_path not found"
 335         return 1
 336     fi
 337 }
 338
 339 ######################################################
 340 # check a command returns zero status
 341 # usage: ctdb_check_command SERVICE_NAME <command>
 342 ######################################################
 343 ctdb_check_command() {
 344   service_name="$1"
 345   wait_cmd="$2"
 346   [ -z "$wait_cmd" ] && return;
 347   $wait_cmd > /dev/null 2>&1 || {
 348       echo "ERROR: $service_name - $wait_cmd returned error"
 349       exit 1
 350   }
 351 }
 352
 353 ################################################
 354 # kill off any TCP connections with the given IP
 355 ################################################
 356 kill_tcp_connections() {
 357     _IP="$1"
 358     _failed=0
 359
 360     _killcount=0
 361     connfile="$CTDB_VARDIR/state/connections.$_IP"
 362     netstat -tn |egrep "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' > $connfile
 363     netstat -tn |egrep "^tcp.*[[:space:]]+::ffff:$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >> $connfile
 364
 365     while read dest src; do
 366         srcip=`echo $src | sed -e "s/:[^:]*$//"`
 367         srcport=`echo $src | sed -e "s/^.*://"`
 368         destip=`echo $dest | sed -e "s/:[^:]*$//"`
 369         destport=`echo $dest | sed -e "s/^.*://"`
 370         echo "Killing TCP connection $srcip:$srcport $destip:$destport"
 371         ctdb killtcp $srcip:$srcport $destip:$destport >/dev/null 2>&1 || _failed=1
 372         case $destport in
 373           # we only do one-way killtcp for CIFS
 374           139|445) : ;;
 375           # for all others we do 2-way
 376           *)
 377                 ctdb killtcp $destip:$destport $srcip:$srcport >/dev/null 2>&1 || _failed=1
 378                 ;;
 379         esac
 380         _killcount=`expr $_killcount + 1`
 381      done < $connfile
 382     rm -f $connfile
 383
 384     [ $_failed = 0 ] || {
 385         echo "Failed to send killtcp control"
 386         return;
 387     }
 388     [ $_killcount -gt 0 ] || {
 389         return;
 390     }
 391     _count=0
 392     while netstat -tn |egrep "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" > /dev/null; do
 393         sleep 1
 394         _count=`expr $_count + 1`
 395         [ $_count -gt 3 ] && {
 396             echo "Timed out killing tcp connections for IP $_IP"
 397             return;
 398         }
 399     done
 400     echo "killed $_killcount TCP connections to released IP $_IP"
 401 }
 402
 403 ##################################################################
 404 # kill off the local end for any TCP connections with the given IP
 405 ##################################################################
 406 kill_tcp_connections_local_only() {
 407     _IP="$1"
 408     _failed=0
 409
 410     _killcount=0
 411     connfile="$CTDB_VARDIR/state/connections.$_IP"
 412     netstat -tn |egrep "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' > $connfile
 413     netstat -tn |egrep "^tcp.*[[:space:]]+::ffff:$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >> $connfile
 414
 415     while read dest src; do
 416         srcip=`echo $src | sed -e "s/:[^:]*$//"`
 417         srcport=`echo $src | sed -e "s/^.*://"`
 418         destip=`echo $dest | sed -e "s/:[^:]*$//"`
 419         destport=`echo $dest | sed -e "s/^.*://"`
 420         echo "Killing TCP connection $srcip:$srcport $destip:$destport"
 421         ctdb killtcp $srcip:$srcport $destip:$destport >/dev/null 2>&1 || _failed=1
 422         _killcount=`expr $_killcount + 1`
 423      done < $connfile
 424     rm -f $connfile
 425
 426     [ $_failed = 0 ] || {
 427         echo "Failed to send killtcp control"
 428         return;
 429     }
 430     [ $_killcount -gt 0 ] || {
 431         return;
 432     }
 433     _count=0
 434     while netstat -tn |egrep "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" > /dev/null; do
 435         sleep 1
 436         _count=`expr $_count + 1`
 437         [ $_count -gt 3 ] && {
 438             echo "Timed out killing tcp connections for IP $_IP"
 439             return;
 440         }
 441     done
 442     echo "killed $_killcount TCP connections to released IP $_IP"
 443 }
 444
 445 ##################################################################
 446 # tickle any TCP connections with the given IP
 447 ##################################################################
 448 tickle_tcp_connections() {
 449     _IP="$1"
 450     _failed=0
 451
 452     _killcount=0
 453     connfile="$CTDB_VARDIR/state/connections.$_IP"
 454     netstat -tn |egrep "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' > $connfile
 455     netstat -tn |egrep "^tcp.*[[:space:]]+::ffff:$_IP:.*ESTABLISHED" | awk '{print $4" "$5}' >> $connfile
 456
 457     while read dest src; do
 458         srcip=`echo $src | sed -e "s/:[^:]*$//"`
 459         srcport=`echo $src | sed -e "s/^.*://"`
 460         destip=`echo $dest | sed -e "s/:[^:]*$//"`
 461         destport=`echo $dest | sed -e "s/^.*://"`
 462         echo "Tickle TCP connection $srcip:$srcport $destip:$destport"
 463         ctdb tickle $srcip:$srcport $destip:$destport >/dev/null 2>&1 || _failed=1
 464         echo "Tickle TCP connection $destip:$destport $srcip:$srcport"
 465         ctdb tickle $destip:$destport $srcip:$srcport >/dev/null 2>&1 || _failed=1
 466      done < $connfile
 467     rm -f $connfile
 468
 469     [ $_failed = 0 ] || {
 470         echo "Failed to send tickle control"
 471         return;
 472     }
 473 }
 474
 475 ########################################################
 476 # start/stop the nfs service on different platforms
 477 ########################################################
 478 startstop_nfs() {
 479         PLATFORM="unknown"
 480         [ -x $CTDB_ETCDIR/init.d/nfsserver ] && {
 481                 PLATFORM="sles"
 482         }
 483         [ -x $CTDB_ETCDIR/init.d/nfslock ] && {
 484                 PLATFORM="rhel"
 485         }
 486
 487         case $PLATFORM in
 488         sles)
 489                 case $1 in
 490                 start)
 491                         service nfsserver start
 492                         ;;
 493                 stop)
 494                         service nfsserver stop > /dev/null 2>&1
 495                         ;;
 496                 restart)
 497                         set_proc "fs/nfsd/threads" 0
 498                         service nfsserver stop > /dev/null 2>&1
 499                         pkill -9 nfsd
 500                         service nfsserver start
 501                         ;;
 502                 esac
 503                 ;;
 504         rhel)
 505                 case $1 in
 506                 start)
 507                         service nfslock start
 508                         service nfs start
 509                         ;;
 510                 stop)
 511                         service nfs stop
 512                         service nfslock stop
 513                         ;;
 514                 restart)
 515                         set_proc "fs/nfsd/threads" 0
 516                         service nfs stop > /dev/null 2>&1
 517                         service nfslock stop > /dev/null 2>&1
 518                         pkill -9 nfsd
 519                         service nfslock start
 520                         service nfs start
 521                         ;;
 522                 esac
 523                 ;;
 524         *)
 525                 echo "Unknown platform. NFS is not supported with ctdb"
 526                 exit 1
 527                 ;;
 528         esac
 529 }
 530
 531 ########################################################
 532 # start/stop the nfs lockmanager service on different platforms
 533 ########################################################
 534 startstop_nfslock() {
 535         PLATFORM="unknown"
 536         [ -x $CTDB_ETCDIR/init.d/nfsserver ] && {
 537                 PLATFORM="sles"
 538         }
 539         [ -x $CTDB_ETCDIR/init.d/nfslock ] && {
 540                 PLATFORM="rhel"
 541         }
 542
 543         case $PLATFORM in
 544         sles)
 545                 # for sles there is no service for lockmanager
 546                 # so we instead just shutdown/restart nfs
 547                 case $1 in
 548                 start)
 549                         service nfsserver start
 550                         ;;
 551                 stop)
 552                         service nfsserver stop > /dev/null 2>&1
 553                         ;;
 554                 restart)
 555                         service nfsserver stop
 556                         service nfsserver start
 557                         ;;
 558                 esac
 559                 ;;
 560         rhel)
 561                 case $1 in
 562                 start)
 563                         service nfslock start
 564                         ;;
 565                 stop)
 566                         service nfslock stop > /dev/null 2>&1
 567                         ;;
 568                 restart)
 569                         service nfslock stop
 570                         service nfslock start
 571                         ;;
 572                 esac
 573                 ;;
 574         *)
 575                 echo "Unknown platform. NFS locking is not supported with ctdb"
 576                 exit 1
 577                 ;;
 578         esac
 579 }
 580
 581 add_ip_to_iface()
 582 {
 583         local _iface=$1
 584         local _ip=$2
 585         local _maskbits=$3
 586         local _state_dir="$CTDB_VARDIR/state/interface_modify"
 587         local _lockfile="$_state_dir/$_iface.flock"
 588         local _readd_base="$_state_dir/$_iface.readd.d"
 589
 590         mkdir -p $_state_dir || {
 591                 ret=$?
 592                 echo "Failed to mkdir -p $_state_dir - $ret"
 593                 return $ret
 594         }
 595
 596         test -f $_lockfile || {
 597                 touch $_lockfile
 598         }
 599
 600         flock --timeout 30 $_lockfile $CTDB_BASE/interface_modify.sh add "$_iface" "$_ip" "$_maskbits" "$_readd_base"
 601         return $?
 602 }
 603
 604 delete_ip_from_iface()
 605 {
 606         local _iface=$1
 607         local _ip=$2
 608         local _maskbits=$3
 609         local _state_dir="$CTDB_VARDIR/state/interface_modify"
 610         local _lockfile="$_state_dir/$_iface.flock"
 611         local _readd_base="$_state_dir/$_iface.readd.d"
 612
 613         mkdir -p $_state_dir || {
 614                 ret=$?
 615                 echo "Failed to mkdir -p $_state_dir - $ret"
 616                 return $ret
 617         }
 618
 619         test -f $_lockfile || {
 620                 touch $_lockfile
 621         }
 622
 623         flock --timeout 30 $_lockfile $CTDB_BASE/interface_modify.sh delete "$_iface" "$_ip" "$_maskbits" "$_readd_base"
 624         return $?
 625 }
 626
 627 setup_iface_ip_readd_script()
 628 {
 629         local _iface=$1
 630         local _ip=$2
 631         local _maskbits=$3
 632         local _readd_script=$4
 633         local _state_dir="$CTDB_VARDIR/state/interface_modify"
 634         local _lockfile="$_state_dir/$_iface.flock"
 635         local _readd_base="$_state_dir/$_iface.readd.d"
 636
 637         mkdir -p $_state_dir || {
 638                 ret=$?
 639                 echo "Failed to mkdir -p $_state_dir - $ret"
 640                 return $ret
 641         }
 642
 643         test -f $_lockfile || {
 644                 touch $_lockfile
 645         }
 646
 647         flock --timeout 30 $_lockfile $CTDB_BASE/interface_modify.sh readd_script "$_iface" "$_ip" "$_maskbits" "$_readd_base" "$_readd_script"
 648         return $?
 649 }
 650
 651 ########################################################
 652 # some simple logic for counting events - per eventscript
 653 # usage: ctdb_counter_init
 654 #        ctdb_counter_incr
 655 #        ctdb_check_counter_limit <limit>
 656 # ctdb_check_counter_limit succeeds when count >= <limit>
 657 ########################################################
 658 _ctdb_counter_common () {
 659     _service_name="${1:-${service_name}}"
 660     _counter_file="$ctdb_fail_dir/$_service_name"
 661     mkdir -p "${_counter_file%/*}" # dirname
 662 }
 663 ctdb_counter_init () {
 664     _ctdb_counter_common "$1"
 665
 666     >"$_counter_file"
 667 }
 668 ctdb_counter_incr () {
 669     _ctdb_counter_common "$1"
 670
 671     # unary counting!
 672     echo -n 1 >> "$_counter_file"
 673 }
 674 ctdb_check_counter_limit () {
 675     _ctdb_counter_common
 676
 677     _limit="${1:-${service_fail_limit}}"
 678     _quiet="$2"
 679
 680     # unary counting!
 681     _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
 682     if [ $_size -ge $_limit ] ; then
 683         echo "ERROR: more than $_limit consecutive failures for $service_name, marking cluster unhealthy"
 684         exit 1
 685     elif [ $_size -gt 0 -a -z "$_quiet" ] ; then
 686         echo "WARNING: less than $_limit consecutive failures ($_size) for $service_name, not unhealthy yet"
 687     fi
 688 }
 689 ctdb_check_counter_equal () {
 690     _ctdb_counter_common
 691
 692     _limit=$1
 693
 694     # unary counting!
 695     _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
 696     if [ $_size -eq $_limit ] ; then
 697         return 1
 698     fi
 699     return 0
 700 }
 701 ctdb_check_counter () {
 702     _msg="${1:-error}"  # "error"  - anything else is silent on fail
 703     _op="${2:--ge}"  # an integer operator supported by test
 704     _limit="${3:-${service_fail_limit}}"
 705     shift 3
 706     _ctdb_counter_common "$1"
 707
 708     # unary counting!
 709     _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
 710     if [ $_size $_op $_limit ] ; then
 711         if [ "$_msg" = "error" ] ; then
 712             echo "ERROR: $_limit consecutive failures for $_service_name, marking node unhealthy"
 713             exit 1
 714         else
 715             return 1
 716         fi
 717     fi
 718 }
 719
 720 ########################################################
 721
 722 ctdb_status_dir="$CTDB_VARDIR/status"
 723 ctdb_fail_dir="$CTDB_VARDIR/failcount"
 724
 725 ctdb_setup_service_state_dir ()
 726 {
 727     service_state_dir="$CTDB_VARDIR/state/${1:-${service_name}}"
 728     mkdir -p "$service_state_dir" || {
 729         echo "Error creating state dir \"$service_state_dir\""
 730         exit 1
 731     }
 732 }
 733
 734 ########################################################
 735 # Managed status history, for auto-start/stop
 736
 737 ctdb_managed_dir="$CTDB_VARDIR/managed_history"
 738
 739 _ctdb_managed_common ()
 740 {
 741     _service_name="${1:-${service_name}}"
 742     _ctdb_managed_file="$ctdb_managed_dir/$_service_name"
 743 }
 744
 745 ctdb_service_managed ()
 746 {
 747     _ctdb_managed_common "$@"
 748     mkdir -p "$ctdb_managed_dir"
 749     touch "$_ctdb_managed_file"
 750 }
 751
 752 ctdb_service_unmanaged ()
 753 {
 754     _ctdb_managed_common "$@"
 755     rm -f "$_ctdb_managed_file"
 756 }
 757
 758 is_ctdb_previously_managed_service ()
 759 {
 760     _ctdb_managed_common "$@"
 761     [ -f "$_ctdb_managed_file" ]
 762 }
 763
 764 ########################################################
 765 # Check and set status
 766
 767 log_status_cat ()
 768 {
 769     echo "node is \"$1\", \"${script_name}\" reports problem: $(cat $2)"
 770 }
 771
 772 ctdb_checkstatus ()
 773 {
 774     if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
 775         log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
 776         return 1
 777     elif [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
 778         log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
 779         return 2
 780     else
 781         return 0
 782     fi
 783 }
 784
 785 ctdb_setstatus ()
 786 {
 787     d="$ctdb_status_dir/$script_name"
 788     case "$1" in
 789         unhealthy|banned)
 790             mkdir -p "$d"
 791             cat "$2" >"$d/$1"
 792             ;;
 793         *)
 794             for i in "banned" "unhealthy" ; do
 795                 rm -f "$d/$i"
 796             done
 797             ;;
 798     esac
 799 }
 800
 801 ##################################################################
 802 # Reconfigure a service on demand
 803
 804 _ctdb_service_reconfigure_common ()
 805 {
 806     _d="$ctdb_status_dir/${1:-${service_name}}"
 807     mkdir -p "$_d"
 808     _ctdb_service_reconfigure_flag="$_d/reconfigure"
 809 }
 810
 811 ctdb_service_needs_reconfigure ()
 812 {
 813     _ctdb_service_reconfigure_common "$@"
 814     [ -e "$_ctdb_service_reconfigure_flag" ]
 815 }
 816
 817 ctdb_service_set_reconfigure ()
 818 {
 819     _ctdb_service_reconfigure_common "$@"
 820     >"$_ctdb_service_reconfigure_flag"
 821 }
 822
 823 ctdb_service_unset_reconfigure ()
 824 {
 825     _ctdb_service_reconfigure_common "$@"
 826     rm -f "$_ctdb_service_reconfigure_flag"
 827 }
 828
 829 ctdb_service_reconfigure ()
 830 {
 831     echo "Reconfiguring service \"$service_name\"..."
 832     ctdb_service_unset_reconfigure "$@"
 833     service_reconfigure "$@" || return $?
 834     ctdb_counter_init "$@"
 835 }
 836
 837 # Default service_reconfigure() function.
 838 service_reconfigure ()
 839 {
 840     service "${1:-$service_name}" restart
 841 }
 842
 843 ctdb_service_check_reconfigure ()
 844 {
 845     # Only do this for certain events.
 846     case "$event_name" in
 847         monitor|ipreallocated) : ;;
 848         *) return 0
 849     esac
 850
 851     if ctdb_service_needs_reconfigure "$@" ; then
 852         ctdb_service_reconfigure "$@"
 853
 854         # Fall through to non-monitor events.
 855         [ "$event_name" = "monitor" ] || return 0
 856
 857         # We don't want to proceed with the rest of the monitor event
 858         # here, so we exit.  However, if we exit 0 then, if the
 859         # service was previously broken, we might return a false
 860         # positive.  So we simply retrieve the status of this script
 861         # from the previous monitor loop and exit with that status.
 862         ctdb scriptstatus | \
 863             grep -q -E "^${script_name}[[:space:]]+Status:OK[[:space:]]"
 864         exit $?
 865     fi
 866 }
 867
 868 ##################################################################
 869 # Does CTDB manage this service? - and associated auto-start/stop
 870
 871 ctdb_compat_managed_service ()
 872 {
 873     if [ "$1" = "yes" -a "$2" = "$_service_name" ] ; then
 874         CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
 875     fi
 876 }
 877
 878 is_ctdb_managed_service ()
 879 {
 880     _service_name="${1:-${service_name}}"
 881
 882     # $t is used just for readability and to allow better accurate
 883     # matching via leading/trailing spaces
 884     t=" $CTDB_MANAGED_SERVICES "
 885
 886     # Return 0 if "<space>$_service_name<space>" appears in $t
 887     if [ "${t#* ${_service_name} }" != "${t}" ] ; then
 888         return 0
 889     fi
 890
 891     # If above didn't match then update $CTDB_MANAGED_SERVICES for
 892     # backward compatibility and try again.
 893     ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
 894     ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
 895     ctdb_compat_managed_service "$CTDB_MANAGES_SCP"      "scp"
 896     ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
 897     ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
 898     ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
 899     ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
 900     ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
 901     ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"
 902
 903     t=" $CTDB_MANAGED_SERVICES "
 904
 905     # Return 0 if "<space>$_service_name<space>" appears in $t
 906     [ "${t#* ${_service_name} }" != "${t}" ]
 907 }
 908
 909 ctdb_start_stop_service ()
 910 {
 911     _service_name="${1:-${service_name}}"
 912
 913     [ "$event_name" = "monitor" ] || return 0
 914
 915     if is_ctdb_managed_service "$_service_name" ; then
 916         if ! is_ctdb_previously_managed_service "$_service_name" ; then
 917             echo "Starting service \"$_service_name\" - now managed"
 918             ctdb_service_start "$_service_name"
 919             exit $?
 920         fi
 921     else
 922         if is_ctdb_previously_managed_service "$_service_name" ; then
 923             echo "Stopping service \"$_service_name\" - no longer managed"
 924             ctdb_service_stop "$_service_name"
 925             exit $?
 926         fi
 927     fi
 928 }
 929
 930 ctdb_service_start ()
 931 {
 932     # The service is marked managed if we've ever tried to start it.
 933     ctdb_service_managed "$@"
 934
 935     # Here we only want $1.  If no argument is passed then
 936     # service_start needs to know.
 937     service_start "$@" || return $?
 938
 939     ctdb_counter_init "$@"
 940 }
 941
 942 ctdb_service_stop ()
 943 {
 944     ctdb_service_unmanaged "$@"
 945     service_stop "$@"
 946 }
 947
 948 # Default service_start() and service_stop() functions.
 949
 950 # These may be overridden in an eventscript.  When overriding, the
 951 # following convention must be followed.  If these functions are
 952 # called with no arguments then they may use internal logic to
 953 # determine whether the service is managed and, therefore, whether
 954 # they should take any action.  However, if the service name is
 955 # specified as an argument then an attempt must be made to start or
 956 # stop the service.  This is because the auto-start/stop code calls
 957 # them with the service name as an argument.
 958 service_start ()
 959 {
 960     service "${1:-${service_name}}" start
 961 }
 962
 963 service_stop ()
 964 {
 965     service "${1:-${service_name}}" stop
 966 }
 967
 968 ##################################################################
 969
 970 ctdb_standard_event_handler ()
 971 {
 972     case "$1" in
 973         status)
 974             ctdb_checkstatus
 975             exit
 976             ;;
 977         setstatus)
 978             shift
 979             ctdb_setstatus "$@"
 980             exit
 981             ;;
 982     esac
 983 }
 984
 985 ipv4_host_addr_to_net_addr()
 986 {
 987         local HOST=$1
 988         local MASKBITS=$2
 989
 990         local HOST0=$(echo $HOST | awk -F . '{print $4}')
 991         local HOST1=$(echo $HOST | awk -F . '{print $3}')
 992         local HOST2=$(echo $HOST | awk -F . '{print $2}')
 993         local HOST3=$(echo $HOST | awk -F . '{print $1}')
 994
 995         local HOST_NUM=$(( $HOST0 + $HOST1 * 256 + $HOST2 * (256 ** 2) + $HOST3 * (256 ** 3) ))
 996
 997         local MASK_NUM=$(( ( (2**32 - 1) * (2**(32 - $MASKBITS)) ) & (2**32 - 1) ))
 998
 999         local NET_NUM=$(( $HOST_NUM & $MASK_NUM))
1000
1001         local NET0=$(( $NET_NUM & 255 ))
1002         local NET1=$(( ($NET_NUM & (255 * 256)) / 256 ))
1003         local NET2=$(( ($NET_NUM & (255 * 256**2)) / 256**2 ))
1004         local NET3=$(( ($NET_NUM & (255 * 256**3)) / 256**3 ))
1005
1006         echo "$NET3.$NET2.$NET1.$NET0"
1007 }
1008
# Print the dotted-quad netmask for an IPv4 prefix length.
#
# Usage: ipv4_maskbits_to_net_mask <maskbits>
#
# Example: ipv4_maskbits_to_net_mask 24  ->  255.255.255.0
ipv4_maskbits_to_net_mask()
{
        local bits=$1

        # 4294967295 = 2^32 - 1.  Multiplying by 2^(32 - bits) shifts the
        # ones left; the final AND trims the result back to 32 bits.
        local mask_num=$(( (4294967295 * (2 ** (32 - $bits))) & 4294967295 ))

        # mask_num is always within 0..2^32-1, so each byte can be taken
        # with a shift and mask.
        local byte1=$(( ($mask_num >> 24) & 255 ))
        local byte2=$(( ($mask_num >> 16) & 255 ))
        local byte3=$(( ($mask_num >> 8) & 255 ))
        local byte4=$(( $mask_num & 255 ))

        echo "$byte1.$byte2.$byte3.$byte4"
}
1022
# Check that the argument is a well-formed dotted-quad IPv4 address.
#
# Usage: ipv4_is_valid_addr <addr>
#
# Returns 0 if <addr> consists of exactly four dot-separated groups of
# decimal digits, each with a value in the range 0-255.  Returns 1 for
# anything else, including an empty string.  Nothing is printed.
ipv4_is_valid_addr()
{
        local ADDR=$1

        # Structural check first, so that the numeric comparisons below
        # only ever see non-empty strings of digits.
        case "$ADDR" in
                "" | *[!0-9.]* | .* | *. | *..* | *.*.*.*.*)
                        # Empty, stray characters, an empty octet, or
                        # more than four octets.
                        return 1
                        ;;
                *.*.*.*)
                        # Exactly four octets.
                        ;;
                *)
                        # Fewer than four octets.
                        return 1
                        ;;
        esac

        # Split on dots; ADDR holds only digits and dots at this point,
        # so the unquoted expansion cannot glob.
        local IFS=.
        local octet
        for octet in $ADDR ; do
                # A digit string too long for test(1) to parse makes the
                # comparison fail, which is treated as invalid.
                [ "$octet" -le 255 ] 2>/dev/null || return 1
        done

        return 0
}
1048
1049 # iptables doesn't like being re-entered, so flock-wrap it.
# Wrapper that runs /sbin/iptables under an flock(1) lock on
# $CTDB_VARDIR/iptables-ctdb.flock, waiting at most 30 seconds for the
# lock.  All arguments are passed through to /sbin/iptables.
iptables()
{
        # The lock path is quoted so that a $CTDB_VARDIR containing
        # whitespace or glob characters is still passed to flock as a
        # single argument.
        flock -w 30 "$CTDB_VARDIR/iptables-ctdb.flock" /sbin/iptables "$@"
}
1054
1055 ########################################################
1056 # tickle handling
1057 ########################################################
1058
# Directory for the temporary per-port connection and tickle lists
# written by update_tickles.  Created when this file is sourced.
tickledir="${CTDB_VARDIR}/state/tickles"
mkdir -p "$tickledir"
1062
# Bring CTDB's tickle list for a port in line with the TCP connections
# currently established to this node's public IPs on that port.
#
# Usage: update_tickles <port>
#
# Runs "ctdb pnn", "ctdb -Y ip", "ctdb -Y gettickles", "ctdb addtickle",
# "ctdb deltickle" and "netstat -tn".  Two scratch files,
# <port>.connections and <port>.tickles, are created under $tickledir
# and removed again before returning.
update_tickles ()
{
        _port="$1"

        mkdir -p "$tickledir" # Just in case

        # Who am I?  Strip the leading "PNN:" from the output of "ctdb pnn".
        _pnn=$(ctdb pnn) ; _pnn=${_pnn#PNN:}

        # What public IPs do I hold?  Print field 2 of every line whose
        # field 3 equals our PNN.
        _ips=$(ctdb -Y ip | awk -F: -v pnn=$_pnn '$3 == pnn {print $2}')

        # IPs as a regexp choice: "(ip1|ip2|...)".  Each dot is replaced
        # by two backslashes and a dot - presumably because the string
        # is unescaped once more when passed to awk via -v below; verify
        # before changing the quoting.
        _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"

        # Record connections to our public IPs in a temporary file.
        # Only "tcp" lines in state ESTABLISHED whose 4th column matches
        # <one of our IPs>:<port> are kept; columns 5 and 4 are printed,
        # in that order, and the result is sorted for use with comm.
        _my_connections="${tickledir}/${_port}.connections"
        rm -f "$_my_connections"
        netstat -tn |
        awk -v destpat="^${_ipschoice}:${_port}\$" \
          '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
        sort >"$_my_connections"

        # Record our current tickles in a temporary file, in the same
        # "a:b c:d" format as the connections file.  The first line of
        # each gettickles output is skipped (presumably a header).
        _my_tickles="${tickledir}/${_port}.tickles"
        rm -f "$_my_tickles"
        for _i in $_ips ; do
                ctdb -Y gettickles $_i $_port |
                awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
        done |
        sort >"$_my_tickles"

        # Add tickles for connections that we haven't already got tickles for
        # (lines that appear only in the connections file).
        comm -23 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
                ctdb addtickle $_src $_dst
        done

        # Remove tickles for connections that are no longer there
        # (lines that appear only in the tickles file).
        comm -13 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
                ctdb deltickle $_src $_dst
        done

        rm -f "$_my_connections" "$_my_tickles"
}
1109
1110 ########################################################
1111 # load a site local config file
1112 ########################################################
1113
# Source the file named by $CTDB_RC_LOCAL, if that variable is set and
# the file is executable.  Two separate tests are used instead of the
# obsolescent "-a" operator.
if [ -n "$CTDB_RC_LOCAL" ] && [ -x "$CTDB_RC_LOCAL" ] ; then
        . "$CTDB_RC_LOCAL"
fi

# Source $CTDB_BASE/rc.local if it is executable.  Paths are quoted so
# that a $CTDB_BASE containing whitespace is handled correctly.
if [ -x "$CTDB_BASE/rc.local" ] ; then
        . "$CTDB_BASE/rc.local"
fi

# Source every executable file in $CTDB_BASE/rc.local.d, in glob order.
# If the directory is empty the unexpanded pattern fails the -x test
# and nothing is sourced.
if [ -d "$CTDB_BASE/rc.local.d" ] ; then
        for i in "$CTDB_BASE"/rc.local.d/* ; do
                [ -x "$i" ] && . "$i"
        done
fi
1127
# Defaults describing the running event script.
event_name="$1"              # first argument of the script
service_fail_limit=1
script_name="${0##*/}"       # $0 with any leading directories removed
service_name="$script_name"  # defaults to the script name