tests/scripts/ctdb_test_functions.bash

   1 # Hey Emacs, this is a -*- shell-script -*- !!!  :-)
   2
   3 fail ()
   4 {
   5     echo "$*"
   6     exit 1
   7 }
   8
   9 ######################################################################
  10
  11 ctdb_test_begin ()
  12 {
  13     local name="$1"
  14
  15     teststarttime=$(date '+%s')
  16     testduration=0
  17
  18     echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
  19     echo "Running test $name ($(date '+%T'))"
  20     echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
  21 }
  22
  23 ctdb_test_end ()
  24 {
  25     local name="$1" ; shift
  26     local status="$1" ; shift
  27     # "$@" is command-line
  28
  29     local interp="SKIPPED"
  30     local statstr=" (reason $*)"
  31     if [ -n "$status" ] ; then
  32         if [ $status -eq 0 ] ; then
  33             interp="PASSED"
  34             statstr=""
  35             echo "ALL OK: $*"
  36         else
  37             interp="FAILED"
  38             statstr=" (status $status)"
  39             testfailures=$(($testfailures+1))
  40         fi
  41     fi
  42
  43     testduration=$(($(date +%s)-$teststarttime))
  44
  45     echo "=========================================================================="
  46     echo "TEST ${interp}: ${name}${statstr} (duration: ${testduration}s)"
  47     echo "=========================================================================="
  48
  49 }
  50
  51 test_exit ()
  52 {
  53     exit $(($testfailures+0))
  54 }
  55
  56 ctdb_test_exit ()
  57 {
  58     local status=$?
  59
  60     trap - 0
  61
  62     [ $(($testfailures+0)) -eq 0 -a $status -ne 0 ] && testfailures=$status
  63     status=$(($testfailures+0))
  64
  65     # Avoid making a test fail from this point onwards.  The test is
  66     # now complete.
  67     set +e
  68
  69     echo "*** TEST COMPLETE (RC=$status), CLEANING UP..."
  70
  71     eval "$ctdb_test_exit_hook" || true
  72     unset ctdb_test_exit_hook
  73
  74     if $ctdb_test_restart_scheduled || ! cluster_is_healthy ; then
  75
  76         restart_ctdb
  77     else
  78         # This could be made unconditional but then we might get
  79         # duplication from the recovery in restart_ctdb.  We want to
  80         # leave the recovery in restart_ctdb so that future tests that
  81         # might do a manual restart mid-test will benefit.
  82         echo "Forcing a recovery..."
  83         onnode 0 ctdb recover
  84     fi
  85
  86     exit $status
  87 }
  88
  89 ctdb_test_exit_hook_add ()
  90 {
  91     ctdb_test_exit_hook="${ctdb_test_exit_hook}${ctdb_test_exit_hook:+ ; }$*"
  92 }
  93
  94 ctdb_test_run ()
  95 {
  96     local name="$1" ; shift
  97
  98     [ -n "$1" ] || set -- "$name"
  99
 100     ctdb_test_begin "$name"
 101
 102     local status=0
 103     "$@" || status=$?
 104
 105     ctdb_test_end "$name" "$status" "$*"
 106
 107     return $status
 108 }
 109
 110 ctdb_test_usage()
 111 {
 112     local status=${1:-2}
 113
 114     cat <<EOF
 115 Usage: $0 [option]
 116
 117 Options:
 118     -h, --help          show this screen.
 119     -v, --version       show test case version.
 120     --category          show the test category (ACL, CTDB, Samba ...).
 121     -d, --description   show test case description.
 122     --summary           show short test case summary.
 123     -x                  trace test using set -x
 124 EOF
 125
 126     exit $status
 127 }
 128
 129 ctdb_test_version ()
 130 {
 131     [ -n "$CTDB_DIR" ] || fail "Can not determine version."
 132
 133     (cd "$CTDB_DIR" && git describe)
 134 }
 135
 136 ctdb_test_cmd_options()
 137 {
 138     [ -n "$1" ] || return 0
 139
 140     case "$1" in
 141         -h|--help)        ctdb_test_usage 0   ;;
 142         -v|--version)     ctdb_test_version   ;;
 143         --category)       echo "CTDB"         ;;
 144         -d|--description) test_info           ;;
 145         -x)               set -x ; return 0   ;;
 146         *)
 147             echo "Error: Unknown parameter = $1"
 148             echo
 149             ctdb_test_usage 2
 150             ;;
 151     esac
 152
 153     exit 0
 154 }
 155
 156 ctdb_test_init ()
 157 {
 158     scriptname=$(basename "$0")
 159     testfailures=0
 160     ctdb_test_restart_scheduled=false
 161
 162     ctdb_test_cmd_options $@
 163
 164     trap "ctdb_test_exit" 0
 165 }
 166
 167 ctdb_test_check_real_cluster ()
 168 {
 169     [ -n "$CTDB_TEST_REAL_CLUSTER" ] && return 0
 170
 171     echo "ERROR: This test must be run on a real/virtual cluster, not local daemons."
 172     return 1
 173 }
 174
 175 ########################################
 176
 177 # Sets: $out
 178 try_command_on_node ()
 179 {
 180     local nodespec="$1" ; shift
 181
 182     local verbose=false
 183     local onnode_opts=""
 184
 185     while [ "${nodespec#-}" != "$nodespec" ] ; do
 186         if [ "$nodespec" = "-v" ] ; then
 187             verbose=true
 188         else
 189             onnode_opts="$nodespec"
 190         fi
 191         nodespec="$1" ; shift
 192     done
 193
 194     local cmd="$*"
 195
 196     out=$(onnode -q $onnode_opts "$nodespec" "$cmd" 2>&1) || {
 197
 198         echo "Failed to execute \"$cmd\" on node(s) \"$nodespec\""
 199         echo "$out"
 200         return 1
 201     }
 202
 203     if $verbose ; then
 204         echo "Output of \"$cmd\":"
 205         echo "$out"
 206     fi
 207 }
 208
 209 sanity_check_output ()
 210 {
 211     local min_lines="$1"
 212     local regexp="$2" # Should be anchored as necessary.
 213     local output="$3"
 214
 215     local ret=0
 216
 217     local num_lines=$(echo "$output" | wc -l)
 218     echo "There are $num_lines lines of output"
 219     if [ $num_lines -lt $min_lines ] ; then
 220         echo "BAD: that's less than the required number (${min_lines})"
 221         ret=1
 222     fi
 223
 224     local status=0
 225     local unexpected # local doesn't pass through status of command on RHS.
 226     unexpected=$(echo "$output" | egrep -v "$regexp") || status=$?
 227
 228     # Note that this is reversed.
 229     if [ $status -eq 0 ] ; then
 230         echo "BAD: unexpected lines in output:"
 231         echo "$unexpected" | cat -A
 232         ret=1
 233     else
 234         echo "Output lines look OK"
 235     fi
 236
 237     return $ret
 238 }
 239
 240 sanity_check_ips ()
 241 {
 242     local ips="$1" # list of "ip node" lines
 243
 244     echo "Sanity checking IPs..."
 245
 246     local x ipp prev
 247     prev=""
 248     while read x ipp ; do
 249         [ "$ipp" = "-1" ] && break
 250         if [ -n "$prev" -a "$ipp" != "$prev" ] ; then
 251             echo "OK"
 252             return 0
 253         fi
 254         prev="$ipp"
 255     done <<<"$ips"
 256
 257     echo "BAD: a node was -1 or IPs are only assigned to one node"
 258     echo "Are you running an old version of CTDB?"
 259     return 1
 260 }
 261
 262 # This returns a list of "ip node" lines in $out
 263 all_ips_on_node()
 264 {
 265     local node=$@
 266     try_command_on_node $node "$CTDB ip -Y -n all | cut -d ':' -f1-3 | sed -e '1d' -e 's@^:@@' -e 's@:@ @g'"
 267 }
 268
 269 select_test_node_and_ips ()
 270 {
 271     all_ips_on_node 0
 272
 273     # When selecting test_node we just want a node that has public
 274     # IPs.  This will work and is economically semi-random.  :-)
 275     local x
 276     read x test_node <<<"$out"
 277
 278     test_node_ips=""
 279     local ip pnn
 280     while read ip pnn ; do
 281         if [ "$pnn" = "$test_node" ] ; then
 282             test_node_ips="${test_node_ips}${test_node_ips:+ }${ip}"
 283         fi
 284     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 285
 286     echo "Selected node ${test_node} with IPs: ${test_node_ips}."
 287     test_ip="${test_node_ips%% *}"
 288 }
 289
 290 #######################################
 291
 292 # Wait until either timeout expires or command succeeds.  The command
 293 # will be tried once per second.
 294 wait_until ()
 295 {
 296     local timeout="$1" ; shift # "$@" is the command...
 297
 298     local negate=false
 299     if [ "$1" = "!" ] ; then
 300         negate=true
 301         shift
 302     fi
 303
 304     echo -n "<${timeout}|"
 305     local t=$timeout
 306     while [ $t -gt 0 ] ; do
 307         local rc=0
 308         "$@" || rc=$?
 309         if { ! $negate && [ $rc -eq 0 ] ; } || \
 310             { $negate && [ $rc -ne 0 ] ; } ; then
 311             echo "|$(($timeout - $t))|"
 312             echo "OK"
 313             return 0
 314         fi
 315         echo -n .
 316         t=$(($t - 1))
 317         sleep 1
 318     done
 319
 320     echo "*TIMEOUT*"
 321
 322     return 1
 323 }
 324
 325 sleep_for ()
 326 {
 327     echo -n "=${1}|"
 328     for i in $(seq 1 $1) ; do
 329         echo -n '.'
 330         sleep 1
 331     done
 332     echo '|'
 333 }
 334
 335 _cluster_is_healthy ()
 336 {
 337     local out x count line
 338
 339     out=$(ctdb -Y status 2>&1) || return 1
 340
 341     {
 342         read x
 343         count=0
 344         while read line ; do
 345             count=$(($count + 1))
 346             [ "${line##:*:*:*1:}" != "$line" ] && return 1
 347         done
 348         [ $count -gt 0 ] && return $?
 349     } <<<"$out" # Yay bash!
 350 }
 351
 352 cluster_is_healthy ()
 353 {
 354     if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
 355         echo "Cluster is HEALTHY"
 356         return 0
 357     else
 358         echo "Cluster is UNHEALTHY"
 359         if ! ${ctdb_test_restart_scheduled:-false} ; then
 360             echo "DEBUG:"
 361             local i
 362             for i in "onnode -q 0 ctdb status" "onnode -q 0 onnode all ctdb scriptstatus" ; do
 363                 echo "$i"
 364                 $i || true
 365             done
 366         fi
 367         return 1
 368     fi
 369 }
 370
 371 wait_until_healthy ()
 372 {
 373     local timeout="${1:-120}"
 374
 375     echo "Waiting for cluster to become healthy..."
 376
 377     wait_until 120 _cluster_is_healthy
 378 }
 379
 380 # This function is becoming nicely overloaded.  Soon it will collapse!  :-)
 381 node_has_status ()
 382 {
 383     local pnn="$1"
 384     local status="$2"
 385
 386     local bits fpat mpat
 387     case "$status" in
 388         (unhealthy)    bits="?:?:?:1:*" ;;
 389         (healthy)      bits="?:?:?:0:*" ;;
 390         (disconnected) bits="1:*" ;;
 391         (connected)    bits="0:*" ;;
 392         (banned)       bits="?:1:*" ;;
 393         (unbanned)     bits="?:0:*" ;;
 394         (disabled)     bits="?:?:1:*" ;;
 395         (enabled)      bits="?:?:0:*" ;;
 396         (stopped)      bits="?:?:?:?:1:*" ;;
 397         (notstopped)   bits="?:?:?:?:0:*" ;;
 398         (frozen)       fpat='^[[:space:]]+frozen[[:space:]]+1$' ;;
 399         (unfrozen)     fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
 400         (monon)        mpat='^Monitoring mode:ACTIVE \(0\)$' ;;
 401         (monoff)       mpat='^Monitoring mode:DISABLED \(1\)$' ;;
 402         *)
 403             echo "node_has_status: unknown status \"$status\""
 404             return 1
 405     esac
 406
 407     if [ -n "$bits" ] ; then
 408         local out x line
 409
 410         out=$(ctdb -Y status 2>&1) || return 1
 411
 412         {
 413             read x
 414             while read line ; do
 415                 # This needs to be done in 2 steps to avoid false matches.
 416                 local line_bits="${line#:${pnn}:*:}"
 417                 [ "$line_bits" = "$line" ] && continue
 418                 [ "${line_bits#${bits}}" != "$line_bits" ] && return 0
 419             done
 420             return 1
 421         } <<<"$out" # Yay bash!
 422     elif [ -n "$fpat" ] ; then
 423         ctdb statistics -n "$pnn" | egrep -q "$fpat"
 424     elif [ -n "$mpat" ] ; then
 425         ctdb getmonmode -n "$pnn" | egrep -q "$mpat"
 426     else
 427         echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
 428         return 1
 429     fi
 430 }
 431
 432 wait_until_node_has_status ()
 433 {
 434     local pnn="$1"
 435     local status="$2"
 436     local timeout="${3:-30}"
 437
 438     echo "Waiting until node $pnn has status \"$status\"..."
 439
 440     if ! onnode any $CTDB_TEST_WRAPPER wait_until $timeout node_has_status "$pnn" "$status" ; then
 441         for i in "onnode -q any ctdb status" "onnode -q any onnode all ctdb scriptstatus" ; do
 442             echo "$i"
 443             $i || true
 444         done
 445
 446         return 1
 447     fi
 448
 449 }
 450
 451 # Useful for superficially testing IP failover.
 452 # IPs must be on nodes matching nodeglob.
 453 ips_are_on_nodeglob ()
 454 {
 455     local nodeglob="$1" ; shift
 456     local ips="$*"
 457
 458     local out
 459
 460     all_ips_on_node 1
 461
 462     while read ip pnn ; do
 463         for check in $ips ; do
 464             if [ "$check" = "$ip" ] ; then
 465                 case "$pnn" in
 466                     ($nodeglob) : ;;
 467                     (*) return 1  ;;
 468                 esac
 469                 ips="${ips/${ip}}" # Remove from list
 470             fi
 471         done
 472     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 473
 474     ips="${ips// }" # Remove any spaces.
 475     [ -z "$ips" ]
 476 }
 477
 478 wait_until_ips_are_on_nodeglob ()
 479 {
 480     echo "Waiting for IPs to fail over..."
 481
 482     wait_until 60 ips_are_on_nodeglob "$@"
 483 }
 484
 485 get_src_socket ()
 486 {
 487     local proto="$1"
 488     local dst_socket="$2"
 489     local pid="$3"
 490     local prog="$4"
 491
 492     local pat="^${proto}[[:space:]]+[[:digit:]]+[[:space:]]+[[:digit:]]+[[:space:]]+[^[:space:]]+[[:space:]]+${dst_socket//./\\.}[[:space:]]+ESTABLISHED[[:space:]]+${pid}/${prog}[[:space:]]*\$"
 493     out=$(netstat -tanp |
 494         egrep "$pat" |
 495         awk '{ print $4 }')
 496
 497     [ -n "$out" ]
 498 }
 499
 500 wait_until_get_src_socket ()
 501 {
 502     local proto="$1"
 503     local dst_socket="$2"
 504     local pid="$3"
 505     local prog="$4"
 506
 507     echo "Waiting for ${prog} to establish connection to ${dst_socket}..."
 508
 509     wait_until 5 get_src_socket "$@"
 510 }
 511
 512 #######################################
 513
 514 # filename will be in $tcpdump_filename, pid in $tcpdump_pid
 515 tcpdump_start ()
 516 {
 517     tcpdump_filter="$1" # global
 518
 519     echo "Running tcpdump..."
 520     tcpdump_filename=$(mktemp)
 521     ctdb_test_exit_hook_add "rm -f $tcpdump_filename"
 522
 523     # The only way of being sure that tcpdump is listening is to send
 524     # some packets that it will see.  So we use dummy pings - the -U
 525     # option to tcpdump ensures that packets are flushed to the file
 526     # as they are captured.
 527     local dummy_addr="127.3.2.1"
 528     local dummy="icmp and dst host ${dummy_addr} and icmp[icmptype] == icmp-echo"
 529     tcpdump -n -p -s 0 -e -U -w $tcpdump_filename -i any "($tcpdump_filter) or ($dummy)" &
 530     ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
 531
 532     echo "Waiting for tcpdump output file to be ready..."
 533     ping -q "$dummy_addr" >/dev/null 2>&1 &
 534     ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
 535
 536     tcpdump_listen_for_dummy ()
 537     {
 538         tcpdump -n -r $tcpdump_filename -c 1 "$dummy" >/dev/null 2>&1
 539     }
 540
 541     wait_until 10 tcpdump_listen_for_dummy
 542 }
 543
 544 # By default, wait for 1 matching packet.
 545 tcpdump_wait ()
 546 {
 547     local count="${1:-1}"
 548     local filter="${2:-${tcpdump_filter}}"
 549
 550     tcpdump_check ()
 551     {
 552         local found=$(tcpdump -n -r $tcpdump_filename "$filter" 2>/dev/null | wc -l)
 553         [ $found -ge $count ]
 554     }
 555
 556     echo "Waiting for tcpdump to capture some packets..."
 557     if ! wait_until 30 tcpdump_check ; then
 558         echo "DEBUG:"
 559         local i
 560         for i in "onnode -q 0 ctdb status" "netstat -tanp" "tcpdump -n -e -r $tcpdump_filename" ; do
 561             echo "$i"
 562             $i || true
 563         done
 564         return 1
 565     fi
 566 }
 567
 568 tcpdump_show ()
 569 {
 570     local filter="${1:-${tcpdump_filter}}"
 571
 572     tcpdump -n -r $tcpdump_filename  "$filter" 2>/dev/null
 573 }
 574
 575 tcptickle_sniff_start ()
 576 {
 577     local src="$1"
 578     local dst="$2"
 579
 580     local in="src host ${dst%:*} and tcp src port ${dst##*:} and dst host ${src%:*} and tcp dst port ${src##*:}"
 581     local out="src host ${src%:*} and tcp src port ${src##*:} and dst host ${dst%:*} and tcp dst port ${dst##*:}"
 582     local tickle_ack="${in} and (tcp[tcpflags] & tcp-ack != 0) and (tcp[14] == 4) and (tcp[15] == 210)" # win == 1234
 583     local ack_ack="${out} and (tcp[tcpflags] & tcp-ack != 0)"
 584     tcptickle_reset="${in} and tcp[tcpflags] & tcp-rst != 0"
 585     local filter="(${tickle_ack}) or (${ack_ack}) or (${tcptickle_reset})"
 586
 587     tcpdump_start "$filter"
 588 }
 589
 590 tcptickle_sniff_wait_show ()
 591 {
 592     tcpdump_wait 1 "$tcptickle_reset"
 593
 594     echo "GOOD: here are some TCP tickle packets:"
 595     tcpdump_show
 596 }
 597
 598 gratarp_sniff_start ()
 599 {
 600     tcpdump_start "arp host ${test_ip}"
 601 }
 602
 603 gratarp_sniff_wait_show ()
 604 {
 605     tcpdump_wait 2
 606
 607     echo "GOOD: this should be the some gratuitous ARPs:"
 608     tcpdump_show
 609 }
 610
 611
 612 #######################################
 613
 614 daemons_stop ()
 615 {
 616     echo "Attempting to politely shutdown daemons..."
 617     onnode 1 ctdb shutdown -n all || true
 618
 619     echo "Sleeping for a while..."
 620     sleep_for 1
 621
 622     if pgrep -f $CTDB_DIR/bin/ctdbd >/dev/null ; then
 623         echo "Killing remaining daemons..."
 624         pkill -f $CTDB_DIR/bin/ctdbd
 625
 626         if pgrep -f $CTDB_DIR/bin/ctdbd >/dev/null ; then
 627             echo "Once more with feeling.."
 628             pkill -9 $CTDB_DIR/bin/ctdbd
 629         fi
 630     fi
 631
 632     local var_dir=$CTDB_DIR/tests/var
 633     rm -rf $var_dir/test.db
 634 }
 635
 636 daemons_setup ()
 637 {
 638     local num_nodes="${CTDB_TEST_NUM_DAEMONS:-2}" # default is 2 nodes
 639
 640     local var_dir=$CTDB_DIR/tests/var
 641
 642     mkdir -p $var_dir/test.db/persistent
 643
 644     local nodes=$var_dir/nodes.txt
 645     local public_addresses=$var_dir/public_addresses.txt
 646     local no_public_addresses=$var_dir/no_public_addresses.txt
 647     rm -f $nodes $public_addresses $no_public_addresses
 648
 649     # If there are (strictly) greater than 2 nodes then we'll randomly
 650     # choose a node to have no public addresses.
 651     local no_public_ips=-1
 652     [ $num_nodes -gt 2 ] && no_public_ips=$(($RANDOM % $num_nodes))
 653     echo "$no_public_ips" >$no_public_addresses
 654
 655     local i
 656     for i in $(seq 1 $num_nodes) ; do
 657         if [ "${CTDB_USE_IPV6}x" != "x" ]; then
 658             echo ::$i >> $nodes
 659             ip addr add ::$i/128 dev lo
 660         else
 661             echo 127.0.0.$i >> $nodes
 662             # 2 public addresses on most nodes, just to make things interesting.
 663             if [ $(($i - 1)) -ne $no_public_ips ] ; then
 664                 echo "192.0.2.$i/24 lo" >> $public_addresses
 665                 echo "192.0.2.$(($i + $num_nodes))/24 lo" >> $public_addresses
 666             fi
 667         fi
 668     done
 669 }
 670
 671 daemons_start_1 ()
 672 {
 673     local pnn="$1"
 674     shift # "$@" gets passed to ctdbd
 675
 676     local var_dir=$CTDB_DIR/tests/var
 677
 678     local nodes=$var_dir/nodes.txt
 679     local public_addresses=$var_dir/public_addresses.txt
 680     local no_public_addresses=$var_dir/no_public_addresses.txt
 681
 682     local no_public_ips=-1
 683     [ -r $no_public_addresses ] && read no_public_ips <$no_public_addresses
 684
 685     if  [ "$no_public_ips" = $pnn ] ; then
 686         echo "Node $no_public_ips will have no public IPs."
 687     fi
 688
 689     local ctdb_options="--reclock=$var_dir/rec.lock --nlist $nodes --nopublicipcheck --event-script-dir=$CTDB_DIR/tests/events.d --logfile=$var_dir/daemons.log -d 0 --dbdir=$var_dir/test.db --dbdir-persistent=$var_dir/test.db/persistent --dbdir-state=$var_dir/test.db/state"
 690
 691     if [ $(id -u) -eq 0 ]; then
 692         ctdb_options="$ctdb_options --public-interface=lo"
 693     fi
 694
 695     if [ $pnn -eq $no_public_ips ] ; then
 696         ctdb_options="$ctdb_options --public-addresses=/dev/null"
 697     else
 698         ctdb_options="$ctdb_options --public-addresses=$public_addresses"
 699     fi
 700
 701     # Need full path so we can use "pkill -f" to kill the daemons.
 702     $VALGRIND $CTDB_DIR/bin/ctdbd --socket=$var_dir/sock.$pnn $ctdb_options "$@" ||return 1
 703 }
 704
 705 daemons_start ()
 706 {
 707     # "$@" gets passed to ctdbd
 708
 709     local num_nodes="${CTDB_TEST_NUM_DAEMONS:-2}" # default is 2 nodes
 710
 711     echo "Starting $num_nodes ctdb daemons..."
 712
 713     for i in $(seq 0 $(($num_nodes - 1))) ; do
 714         daemons_start_1 $i "$@"
 715     done
 716
 717     local var_dir=$CTDB_DIR/tests/var
 718
 719     if [ -L /tmp/ctdb.socket -o ! -S /tmp/ctdb.socket ] ; then
 720         ln -sf $var_dir/sock.0 /tmp/ctdb.socket || return 1
 721     fi
 722 }
 723
 724 #######################################
 725
 726 _ctdb_hack_options ()
 727 {
 728     local ctdb_options="$*"
 729
 730     # We really just want to pass CTDB_OPTIONS but on RH
 731     # /etc/sysconfig/ctdb can, and frequently does, set that variable.
 732     # So instead, we hack badly.  We'll add these as we use them.
 733     # Note that these may still be overridden by the above file... but
 734     # we tend to use the exotic options here... so that is unlikely.
 735
 736     case "$ctdb_options" in
 737         *--start-as-stopped*)
 738             export CTDB_START_AS_STOPPED="yes"
 739     esac
 740 }
 741
 742 _restart_ctdb ()
 743 {
 744     _ctdb_hack_options "$@"
 745
 746     if [ -e /etc/redhat-release ] ; then
 747         service ctdb restart
 748     else
 749         /etc/init.d/ctdb restart
 750     fi
 751 }
 752
 753 _ctdb_start ()
 754 {
 755     _ctdb_hack_options "$@"
 756
 757     /etc/init.d/ctdb start
 758 }
 759
 760 setup_ctdb ()
 761 {
 762     if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 763         daemons_setup
 764     fi
 765 }
 766
 767 # Common things to do after starting one or more nodes.
 768 _ctdb_start_post ()
 769 {
 770     onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1
 771
 772     echo "Setting RerecoveryTimeout to 1"
 773     onnode -pq all "ctdb setvar RerecoveryTimeout 1"
 774
 775     # In recent versions of CTDB, forcing a recovery like this blocks
 776     # until the recovery is complete.  Hopefully this will help the
 777     # cluster to stabilise before a subsequent test.
 778     echo "Forcing a recovery..."
 779     onnode -q 0 ctdb recover
 780     sleep_for 1
 781     echo "Forcing a recovery..."
 782     onnode -q 0 ctdb recover
 783
 784     echo "ctdb is ready"
 785 }
 786
 787 # This assumes that ctdbd is not running on the given node.
 788 ctdb_start_1 ()
 789 {
 790     local pnn="$1"
 791     shift # "$@" is passed to ctdbd start.
 792
 793     echo -n "Starting CTDB on node ${pnn}..."
 794
 795     if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 796         daemons_start_1 $pnn "$@"
 797     else
 798         onnode $pnn $CTDB_TEST_WRAPPER _ctdb_start "$@"
 799     fi
 800
 801     # If we're starting only 1 node then we're doing something weird.
 802     ctdb_restart_when_done
 803 }
 804
 805 restart_ctdb ()
 806 {
 807     # "$@" is passed to ctdbd start.
 808
 809     echo -n "Restarting CTDB"
 810     if $ctdb_test_restart_scheduled ; then
 811         echo -n " (scheduled)"
 812     fi
 813     echo "..."
 814
 815     local i=0
 816     while : ; do
 817         if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 818             daemons_stop
 819             daemons_start "$@"
 820         else
 821             onnode -p all $CTDB_TEST_WRAPPER _restart_ctdb "$@"
 822         fi && break
 823
 824         i=$(($i + 1))
 825         [ $i -lt 5 ] || break
 826
 827         echo "That didn't seem to work - sleeping for a while..."
 828         sleep_for 5
 829     done
 830
 831     onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1
 832
 833     echo "Setting RerecoveryTimeout to 1"
 834     onnode -pq all "ctdb setvar RerecoveryTimeout 1"
 835
 836     # In recent versions of CTDB, forcing a recovery like this blocks
 837     # until the recovery is complete.  Hopefully this will help the
 838     # cluster to stabilise before a subsequent test.
 839     echo "Forcing a recovery..."
 840     onnode -q 0 ctdb recover
 841     sleep_for 1
 842     echo "Forcing a recovery..."
 843     onnode -q 0 ctdb recover
 844
 845     echo "ctdb is ready"
 846 }
 847
 848 ctdb_restart_when_done ()
 849 {
 850     ctdb_test_restart_scheduled=true
 851 }
 852
 853 #######################################
 854
 855 install_eventscript ()
 856 {
 857     local script_name="$1"
 858     local script_contents="$2"
 859
 860     if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
 861         # The quoting here is *very* fragile.  However, we do
 862         # experience the joy of installing a short script using
 863         # onnode, and without needing to know the IP addresses of the
 864         # nodes.
 865         onnode all "f=\"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\" ; echo \"Installing \$f\" ; echo '${script_contents}' > \"\$f\" ; chmod 755 \"\$f\""
 866     else
 867         f="${CTDB_DIR}/tests/events.d/${script_name}"
 868         echo "$script_contents" >"$f"
 869         chmod 755 "$f"
 870     fi
 871 }
 872
 873 uninstall_eventscript ()
 874 {
 875     local script_name="$1"
 876
 877     if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
 878         onnode all "rm -vf \"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\""
 879     else
 880         rm -vf "${CTDB_DIR}/tests/events.d/${script_name}"
 881     fi
 882 }
 883
 884 #######################################
 885
 886 # This section deals with the 99.ctdb_test eventscript.
 887
 888 # Metafunctions: Handle a ctdb-test file on a node.
 889 # given event.
 890 ctdb_test_eventscript_file_create ()
 891 {
 892     local pnn="$1"
 893     local type="$2"
 894
 895     try_command_on_node $pnn touch "/tmp/ctdb-test-${type}.${pnn}"
 896 }
 897
 898 ctdb_test_eventscript_file_remove ()
 899 {
 900     local pnn="$1"
 901     local type="$2"
 902
 903     try_command_on_node $pnn rm -f "/tmp/ctdb-test-${type}.${pnn}"
 904 }
 905
 906 ctdb_test_eventscript_file_exists ()
 907 {
 908     local pnn="$1"
 909     local type="$2"
 910
 911     try_command_on_node $pnn test -f "/tmp/ctdb-test-${type}.${pnn}" >/dev/null 2>&1
 912 }
 913
 914
 915 # Handle a flag file on a node that is removed by 99.ctdb_test on the
 916 # given event.
 917 ctdb_test_eventscript_flag ()
 918 {
 919     local cmd="$1"
 920     local pnn="$2"
 921     local event="$3"
 922
 923     ctdb_test_eventscript_file_${cmd} "$pnn" "flag-${event}"
 924 }
 925
 926
 927 # Handle a trigger that causes 99.ctdb_test to fail it's monitor
 928 # event.
 929 ctdb_test_eventscript_unhealthy_trigger ()
 930 {
 931     local cmd="$1"
 932     local pnn="$2"
 933
 934     ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-trigger"
 935 }
 936
 937 # Handle the file that 99.ctdb_test created to show that it has marked
 938 # a node unhealthy because it detected the above trigger.
 939 ctdb_test_eventscript_unhealthy_detected ()
 940 {
 941     local cmd="$1"
 942     local pnn="$2"
 943
 944     ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-detected"
 945 }
 946
 947 # Handle a trigger that causes 99.ctdb_test to timeout it's monitor
 948 # event.  This should cause the node to be banned.
 949 ctdb_test_eventscript_timeout_trigger ()
 950 {
 951     local cmd="$1"
 952     local pnn="$2"
 953     local event="$3"
 954
 955     ctdb_test_eventscript_file_${cmd} "$pnn" "${event}-timeout"
 956 }
 957
 958 # Note that the eventscript can't use the above functions!
 959 ctdb_test_eventscript_install ()
 960 {
 961
 962     local script='#!/bin/sh
 963 out=$(ctdb pnn)
 964 pnn="${out#PNN:}"
 965
 966 rm -vf "/tmp/ctdb-test-flag-${1}.${pnn}"
 967
 968 trigger="/tmp/ctdb-test-unhealthy-trigger.${pnn}"
 969 detected="/tmp/ctdb-test-unhealthy-detected.${pnn}"
 970 timeout_trigger="/tmp/ctdb-test-${1}-timeout.${pnn}"
 971 case "$1" in
 972     monitor)
 973         if [ -e "$trigger" ] ; then
 974             echo "${0}: Unhealthy because \"$trigger\" detected"
 975             touch "$detected"
 976             exit 1
 977         elif [ -e "$detected" -a ! -e "$trigger" ] ; then
 978             echo "${0}: Healthy again, \"$trigger\" no longer detected"
 979             rm "$detected"
 980         fi
 981
 982         ;;
 983     *)
 984         if [ -e "$timeout_trigger" ] ; then
 985             echo "${0}: Sleeping for a long time because \"$timeout_trigger\" detected"
 986             sleep 9999
 987         fi
 988         ;;
 989         *)
 990
 991 esac
 992
 993 exit 0
 994 '
 995     install_eventscript "99.ctdb_test" "$script"
 996 }
 997
 998 ctdb_test_eventscript_uninstall ()
 999 {
1000     uninstall_eventscript "99.ctdb_test"
1001 }
1002
# Wait for a monitor event to be run on node $1.
# Note that this only works if you know all other monitor events will
# succeed.  You also need to install the eventscript before using it.
# A flag file for the "monitor" event is created on the node; this
# then waits up to 120 seconds for the file to disappear.
wait_for_monitor_event ()
{
    local pnn="$1"
    local event="monitor"

    echo "Waiting for a monitor event on node ${pnn}..."
    ctdb_test_eventscript_flag create $pnn "$event"

    wait_until 120 ! ctdb_test_eventscript_flag exists $pnn "$event"

}