tests/scripts/ctdb_test_functions.bash

   1 # Hey Emacs, this is a -*- shell-script -*- !!!  :-)
   2
   3 fail ()
   4 {
   5     echo "$*"
   6     exit 1
   7 }
   8
   9 ######################################################################
  10
  11 ctdb_test_begin ()
  12 {
  13     local name="$1"
  14
  15     teststarttime=$(date '+%s')
  16     testduration=0
  17
  18     echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
  19     echo "Running test $name ($(date '+%T'))"
  20     echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
  21 }
  22
  23 ctdb_test_end ()
  24 {
  25     local name="$1" ; shift
  26     local status="$1" ; shift
  27     # "$@" is command-line
  28
  29     local interp="SKIPPED"
  30     local statstr=" (reason $*)"
  31     if [ -n "$status" ] ; then
  32         if [ $status -eq 0 ] ; then
  33             interp="PASSED"
  34             statstr=""
  35             echo "ALL OK: $*"
  36         else
  37             interp="FAILED"
  38             statstr=" (status $status)"
  39             testfailures=$(($testfailures+1))
  40         fi
  41     fi
  42
  43     testduration=$(($(date +%s)-$teststarttime))
  44
  45     echo "=========================================================================="
  46     echo "TEST ${interp}: ${name}${statstr} (duration: ${testduration}s)"
  47     echo "=========================================================================="
  48
  49 }
  50
  51 test_exit ()
  52 {
  53     exit $(($testfailures+0))
  54 }
  55
  56 ctdb_test_exit ()
  57 {
  58     local status=$?
  59
  60     trap - 0
  61
  62     [ $(($testfailures+0)) -eq 0 -a $status -ne 0 ] && testfailures=$status
  63     status=$(($testfailures+0))
  64
  65     # Avoid making a test fail from this point onwards.  The test is
  66     # now complete.
  67     set +e
  68
  69     echo "*** TEST COMPLETE (RC=$status), CLEANING UP..."
  70
  71     eval "$ctdb_test_exit_hook" || true
  72     unset ctdb_test_exit_hook
  73
  74     if $ctdb_test_restart_scheduled || ! cluster_is_healthy ; then
  75
  76         restart_ctdb
  77     else
  78         # This could be made unconditional but then we might get
  79         # duplication from the recovery in restart_ctdb.  We want to
  80         # leave the recovery in restart_ctdb so that future tests that
  81         # might do a manual restart mid-test will benefit.
  82         echo "Forcing a recovery..."
  83         onnode 0 ctdb recover
  84     fi
  85
  86     exit $status
  87 }
  88
  89 ctdb_test_exit_hook_add ()
  90 {
  91     ctdb_test_exit_hook="${ctdb_test_exit_hook}${ctdb_test_exit_hook:+ ; }$*"
  92 }
  93
  94 ctdb_test_run ()
  95 {
  96     local name="$1" ; shift
  97
  98     [ -n "$1" ] || set -- "$name"
  99
 100     ctdb_test_begin "$name"
 101
 102     local status=0
 103     "$@" || status=$?
 104
 105     ctdb_test_end "$name" "$status" "$*"
 106
 107     return $status
 108 }
 109
 110 ctdb_test_usage()
 111 {
 112     local status=${1:-2}
 113
 114     cat <<EOF
 115 Usage: $0 [option]
 116
 117 Options:
 118     -h, --help          show this screen.
 119     -v, --version       show test case version.
 120     --category          show the test category (ACL, CTDB, Samba ...).
 121     -d, --description   show test case description.
 122     --summary           show short test case summary.
 123     -x                  trace test using set -x
 124 EOF
 125
 126     exit $status
 127 }
 128
 129 ctdb_test_version ()
 130 {
 131     [ -n "$CTDB_DIR" ] || fail "Can not determine version."
 132
 133     (cd "$CTDB_DIR" && git describe)
 134 }
 135
 136 ctdb_test_cmd_options()
 137 {
 138     [ -n "$1" ] || return 0
 139
 140     case "$1" in
 141         -h|--help)        ctdb_test_usage 0   ;;
 142         -v|--version)     ctdb_test_version   ;;
 143         --category)       echo "CTDB"         ;;
 144         -d|--description) test_info           ;;
 145         -x)               set -x ; return 0   ;;
 146         *)
 147             echo "Error: Unknown parameter = $1"
 148             echo
 149             ctdb_test_usage 2
 150             ;;
 151     esac
 152
 153     exit 0
 154 }
 155
 156 ctdb_test_init ()
 157 {
 158     scriptname=$(basename "$0")
 159     testfailures=0
 160     ctdb_test_restart_scheduled=false
 161
 162     ctdb_test_cmd_options $@
 163
 164     trap "ctdb_test_exit" 0
 165 }
 166
 167 ctdb_test_check_real_cluster ()
 168 {
 169     [ -n "$CTDB_TEST_REAL_CLUSTER" ] && return 0
 170
 171     echo "ERROR: This test must be run on a real/virtual cluster, not local daemons."
 172     return 1
 173 }
 174
 175 ########################################
 176
 177 # Sets: $out
 178 try_command_on_node ()
 179 {
 180     local nodespec="$1" ; shift
 181
 182     local verbose=false
 183     local onnode_opts=""
 184
 185     while [ "${nodespec#-}" != "$nodespec" ] ; do
 186         if [ "$nodespec" = "-v" ] ; then
 187             verbose=true
 188         else
 189             onnode_opts="$nodespec"
 190         fi
 191         nodespec="$1" ; shift
 192     done
 193
 194     local cmd="$*"
 195
 196     out=$(onnode -q $onnode_opts "$nodespec" "$cmd" 2>&1) || {
 197
 198         echo "Failed to execute \"$cmd\" on node(s) \"$nodespec\""
 199         echo "$out"
 200         return 1
 201     }
 202
 203     if $verbose ; then
 204         echo "Output of \"$cmd\":"
 205         echo "$out"
 206     fi
 207 }
 208
 209 sanity_check_output ()
 210 {
 211     local min_lines="$1"
 212     local regexp="$2" # Should be anchored as necessary.
 213     local output="$3"
 214
 215     local ret=0
 216
 217     local num_lines=$(echo "$output" | wc -l)
 218     echo "There are $num_lines lines of output"
 219     if [ $num_lines -lt $min_lines ] ; then
 220         echo "BAD: that's less than the required number (${min_lines})"
 221         ret=1
 222     fi
 223
 224     local status=0
 225     local unexpected # local doesn't pass through status of command on RHS.
 226     unexpected=$(echo "$output" | egrep -v "$regexp") || status=$?
 227
 228     # Note that this is reversed.
 229     if [ $status -eq 0 ] ; then
 230         echo "BAD: unexpected lines in output:"
 231         echo "$unexpected" | cat -A
 232         ret=1
 233     else
 234         echo "Output lines look OK"
 235     fi
 236
 237     return $ret
 238 }
 239
 240 sanity_check_ips ()
 241 {
 242     local ips="$1" # list of "ip node" lines
 243
 244     echo "Sanity checking IPs..."
 245
 246     local x ipp prev
 247     prev=""
 248     while read x ipp ; do
 249         [ "$ipp" = "-1" ] && break
 250         if [ -n "$prev" -a "$ipp" != "$prev" ] ; then
 251             echo "OK"
 252             return 0
 253         fi
 254         prev="$ipp"
 255     done <<<"$ips"
 256
 257     echo "BAD: a node was -1 or IPs are only assigned to one node"
 258     echo "Are you running an old version of CTDB?"
 259     return 1
 260 }
 261
 262 # This returns a list of "ip node" lines in $out
 263 all_ips_on_node()
 264 {
 265     local node=$@
 266     try_command_on_node $node "$CTDB ip -Y -n all | cut -d ':' -f1-3 | sed -e '1d' -e 's@^:@@' -e 's@:@ @g'"
 267 }
 268
 269 select_test_node_and_ips ()
 270 {
 271     all_ips_on_node 0
 272
 273     # When selecting test_node we just want a node that has public
 274     # IPs.  This will work and is economically semi-random.  :-)
 275     local x
 276     read x test_node <<<"$out"
 277
 278     test_node_ips=""
 279     local ip pnn
 280     while read ip pnn ; do
 281         if [ "$pnn" = "$test_node" ] ; then
 282             test_node_ips="${test_node_ips}${test_node_ips:+ }${ip}"
 283         fi
 284     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 285
 286     echo "Selected node ${test_node} with IPs: ${test_node_ips}."
 287     test_ip="${test_node_ips%% *}"
 288 }
 289
 290 #######################################
 291
 292 # Wait until either timeout expires or command succeeds.  The command
 293 # will be tried once per second.
 294 wait_until ()
 295 {
 296     local timeout="$1" ; shift # "$@" is the command...
 297
 298     local negate=false
 299     if [ "$1" = "!" ] ; then
 300         negate=true
 301         shift
 302     fi
 303
 304     echo -n "<${timeout}|"
 305     local t=$timeout
 306     while [ $t -gt 0 ] ; do
 307         local rc=0
 308         "$@" || rc=$?
 309         if { ! $negate && [ $rc -eq 0 ] ; } || \
 310             { $negate && [ $rc -ne 0 ] ; } ; then
 311             echo "|$(($timeout - $t))|"
 312             echo "OK"
 313             return 0
 314         fi
 315         echo -n .
 316         t=$(($t - 1))
 317         sleep 1
 318     done
 319
 320     echo "*TIMEOUT*"
 321
 322     return 1
 323 }
 324
 325 sleep_for ()
 326 {
 327     echo -n "=${1}|"
 328     for i in $(seq 1 $1) ; do
 329         echo -n '.'
 330         sleep 1
 331     done
 332     echo '|'
 333 }
 334
 335 _cluster_is_healthy ()
 336 {
 337     local out x count line
 338
 339     out=$(ctdb -Y status 2>&1) || return 1
 340
 341     {
 342         read x
 343         count=0
 344         while read line ; do
 345             count=$(($count + 1))
 346             [ "${line##:*:*:*1:}" != "$line" ] && return 1
 347         done
 348         [ $count -gt 0 ] && return $?
 349     } <<<"$out" # Yay bash!
 350 }
 351
 352 cluster_is_healthy ()
 353 {
 354     if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
 355         echo "Cluster is HEALTHY"
 356         return 0
 357     else
 358         echo "Cluster is UNHEALTHY"
 359         if ! ${ctdb_test_restart_scheduled:-false} ; then
 360             echo "DEBUG:"
 361             local i
 362             for i in "onnode -q 0 ctdb status" "onnode -q 0 onnode all ctdb scriptstatus" ; do
 363                 echo "$i"
 364                 $i || true
 365             done
 366         fi
 367         return 1
 368     fi
 369 }
 370
 371 wait_until_healthy ()
 372 {
 373     local timeout="${1:-120}"
 374
 375     echo "Waiting for cluster to become healthy..."
 376
 377     wait_until 120 _cluster_is_healthy
 378 }
 379
 380 # This function is becoming nicely overloaded.  Soon it will collapse!  :-)
 381 node_has_status ()
 382 {
 383     local pnn="$1"
 384     local status="$2"
 385
 386     local bits fpat mpat
 387     case "$status" in
 388         (unhealthy)    bits="?:?:?:1:*" ;;
 389         (healthy)      bits="?:?:?:0:*" ;;
 390         (disconnected) bits="1:*" ;;
 391         (connected)    bits="0:*" ;;
 392         (banned)       bits="?:1:*" ;;
 393         (unbanned)     bits="?:0:*" ;;
 394         (disabled)     bits="?:?:1:*" ;;
 395         (enabled)      bits="?:?:0:*" ;;
 396         (stopped)      bits="?:?:?:?:1:*" ;;
 397         (notstopped)   bits="?:?:?:?:0:*" ;;
 398         (frozen)       fpat='^[[:space:]]+frozen[[:space:]]+1$' ;;
 399         (unfrozen)     fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
 400         (monon)        mpat='^Monitoring mode:ACTIVE \(0\)$' ;;
 401         (monoff)       mpat='^Monitoring mode:DISABLED \(1\)$' ;;
 402         *)
 403             echo "node_has_status: unknown status \"$status\""
 404             return 1
 405     esac
 406
 407     if [ -n "$bits" ] ; then
 408         local out x line
 409
 410         out=$(ctdb -Y status 2>&1) || return 1
 411
 412         {
 413             read x
 414             while read line ; do
 415                 # This needs to be done in 2 steps to avoid false matches.
 416                 local line_bits="${line#:${pnn}:*:}"
 417                 [ "$line_bits" = "$line" ] && continue
 418                 [ "${line_bits#${bits}}" != "$line_bits" ] && return 0
 419             done
 420             return 1
 421         } <<<"$out" # Yay bash!
 422     elif [ -n "$fpat" ] ; then
 423         ctdb statistics -n "$pnn" | egrep -q "$fpat"
 424     elif [ -n "$mpat" ] ; then
 425         ctdb getmonmode -n "$pnn" | egrep -q "$mpat"
 426     else
 427         echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
 428         return 1
 429     fi
 430 }
 431
 432 wait_until_node_has_status ()
 433 {
 434     local pnn="$1"
 435     local status="$2"
 436     local timeout="${3:-30}"
 437
 438     echo "Waiting until node $pnn has status \"$status\"..."
 439
 440     if ! onnode any $CTDB_TEST_WRAPPER wait_until $timeout node_has_status "$pnn" "$status" ; then
 441         for i in "onnode -q any ctdb status" "onnode -q any onnode all ctdb scriptstatus" ; do
 442             echo "$i"
 443             $i || true
 444         done
 445
 446         return 1
 447     fi
 448
 449 }
 450
 451 # Useful for superficially testing IP failover.
 452 # IPs must be on nodes matching nodeglob.
 453 ips_are_on_nodeglob ()
 454 {
 455     local nodeglob="$1" ; shift
 456     local ips="$*"
 457
 458     local out
 459
 460     all_ips_on_node 1
 461
 462     while read ip pnn ; do
 463         for check in $ips ; do
 464             if [ "$check" = "$ip" ] ; then
 465                 case "$pnn" in
 466                     ($nodeglob) : ;;
 467                     (*) return 1  ;;
 468                 esac
 469                 ips="${ips/${ip}}" # Remove from list
 470             fi
 471         done
 472     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 473
 474     ips="${ips// }" # Remove any spaces.
 475     [ -z "$ips" ]
 476 }
 477
 478 wait_until_ips_are_on_nodeglob ()
 479 {
 480     echo "Waiting for IPs to fail over..."
 481
 482     wait_until 60 ips_are_on_nodeglob "$@"
 483 }
 484
 485 node_has_some_ips ()
 486 {
 487     local node="$1"
 488
 489     local out
 490
 491     all_ips_on_node 1
 492
 493     while read ip pnn ; do
 494         if [ "$node" = "$pnn" ] ; then
 495             return 0
 496         fi
 497     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 498
 499     return 1
 500 }
 501
 502 wait_until_node_has_some_ips ()
 503 {
 504     echo "Waiting for node to have some IPs..."
 505
 506     wait_until 60 node_has_some_ips "$@"
 507 }
 508
 509 get_src_socket ()
 510 {
 511     local proto="$1"
 512     local dst_socket="$2"
 513     local pid="$3"
 514     local prog="$4"
 515
 516     local pat="^${proto}[[:space:]]+[[:digit:]]+[[:space:]]+[[:digit:]]+[[:space:]]+[^[:space:]]+[[:space:]]+${dst_socket//./\\.}[[:space:]]+ESTABLISHED[[:space:]]+${pid}/${prog}[[:space:]]*\$"
 517     out=$(netstat -tanp |
 518         egrep "$pat" |
 519         awk '{ print $4 }')
 520
 521     [ -n "$out" ]
 522 }
 523
 524 wait_until_get_src_socket ()
 525 {
 526     local proto="$1"
 527     local dst_socket="$2"
 528     local pid="$3"
 529     local prog="$4"
 530
 531     echo "Waiting for ${prog} to establish connection to ${dst_socket}..."
 532
 533     wait_until 5 get_src_socket "$@"
 534 }
 535
 536 #######################################
 537
 538 # filename will be in $tcpdump_filename, pid in $tcpdump_pid
 539 tcpdump_start ()
 540 {
 541     tcpdump_filter="$1" # global
 542
 543     echo "Running tcpdump..."
 544     tcpdump_filename=$(mktemp)
 545     ctdb_test_exit_hook_add "rm -f $tcpdump_filename"
 546
 547     # The only way of being sure that tcpdump is listening is to send
 548     # some packets that it will see.  So we use dummy pings - the -U
 549     # option to tcpdump ensures that packets are flushed to the file
 550     # as they are captured.
 551     local dummy_addr="127.3.2.1"
 552     local dummy="icmp and dst host ${dummy_addr} and icmp[icmptype] == icmp-echo"
 553     tcpdump -n -p -s 0 -e -U -w $tcpdump_filename -i any "($tcpdump_filter) or ($dummy)" &
 554     ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
 555
 556     echo "Waiting for tcpdump output file to be ready..."
 557     ping -q "$dummy_addr" >/dev/null 2>&1 &
 558     ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
 559
 560     tcpdump_listen_for_dummy ()
 561     {
 562         tcpdump -n -r $tcpdump_filename -c 1 "$dummy" >/dev/null 2>&1
 563     }
 564
 565     wait_until 10 tcpdump_listen_for_dummy
 566 }
 567
 568 # By default, wait for 1 matching packet.
 569 tcpdump_wait ()
 570 {
 571     local count="${1:-1}"
 572     local filter="${2:-${tcpdump_filter}}"
 573
 574     tcpdump_check ()
 575     {
 576         local found=$(tcpdump -n -r $tcpdump_filename "$filter" 2>/dev/null | wc -l)
 577         [ $found -ge $count ]
 578     }
 579
 580     echo "Waiting for tcpdump to capture some packets..."
 581     if ! wait_until 30 tcpdump_check ; then
 582         echo "DEBUG:"
 583         local i
 584         for i in "onnode -q 0 ctdb status" "netstat -tanp" "tcpdump -n -e -r $tcpdump_filename" ; do
 585             echo "$i"
 586             $i || true
 587         done
 588         return 1
 589     fi
 590 }
 591
 592 tcpdump_show ()
 593 {
 594     local filter="${1:-${tcpdump_filter}}"
 595
 596     tcpdump -n -r $tcpdump_filename  "$filter" 2>/dev/null
 597 }
 598
 599 tcptickle_sniff_start ()
 600 {
 601     local src="$1"
 602     local dst="$2"
 603
 604     local in="src host ${dst%:*} and tcp src port ${dst##*:} and dst host ${src%:*} and tcp dst port ${src##*:}"
 605     local out="src host ${src%:*} and tcp src port ${src##*:} and dst host ${dst%:*} and tcp dst port ${dst##*:}"
 606     local tickle_ack="${in} and (tcp[tcpflags] & tcp-ack != 0) and (tcp[14] == 4) and (tcp[15] == 210)" # win == 1234
 607     local ack_ack="${out} and (tcp[tcpflags] & tcp-ack != 0)"
 608     tcptickle_reset="${in} and tcp[tcpflags] & tcp-rst != 0"
 609     local filter="(${tickle_ack}) or (${ack_ack}) or (${tcptickle_reset})"
 610
 611     tcpdump_start "$filter"
 612 }
 613
 614 tcptickle_sniff_wait_show ()
 615 {
 616     tcpdump_wait 1 "$tcptickle_reset"
 617
 618     echo "GOOD: here are some TCP tickle packets:"
 619     tcpdump_show
 620 }
 621
 622 gratarp_sniff_start ()
 623 {
 624     tcpdump_start "arp host ${test_ip}"
 625 }
 626
 627 gratarp_sniff_wait_show ()
 628 {
 629     tcpdump_wait 2
 630
 631     echo "GOOD: this should be the some gratuitous ARPs:"
 632     tcpdump_show
 633 }
 634
 635
 636 #######################################
 637
 638 daemons_stop ()
 639 {
 640     echo "Attempting to politely shutdown daemons..."
 641     onnode 1 ctdb shutdown -n all || true
 642
 643     echo "Sleeping for a while..."
 644     sleep_for 1
 645
 646     if pgrep -f $CTDB_DIR/bin/ctdbd >/dev/null ; then
 647         echo "Killing remaining daemons..."
 648         pkill -f $CTDB_DIR/bin/ctdbd
 649
 650         if pgrep -f $CTDB_DIR/bin/ctdbd >/dev/null ; then
 651             echo "Once more with feeling.."
 652             pkill -9 $CTDB_DIR/bin/ctdbd
 653         fi
 654     fi
 655
 656     local var_dir=$CTDB_DIR/tests/var
 657     rm -rf $var_dir/test.db
 658 }
 659
 660 daemons_setup ()
 661 {
 662     local num_nodes="${CTDB_TEST_NUM_DAEMONS:-2}" # default is 2 nodes
 663
 664     local var_dir=$CTDB_DIR/tests/var
 665
 666     mkdir -p $var_dir/test.db/persistent
 667
 668     local nodes=$var_dir/nodes.txt
 669     local public_addresses=$var_dir/public_addresses.txt
 670     local no_public_addresses=$var_dir/no_public_addresses.txt
 671     rm -f $nodes $public_addresses $no_public_addresses
 672
 673     # If there are (strictly) greater than 2 nodes then we'll randomly
 674     # choose a node to have no public addresses.
 675     local no_public_ips=-1
 676     [ $num_nodes -gt 2 ] && no_public_ips=$(($RANDOM % $num_nodes))
 677     echo "$no_public_ips" >$no_public_addresses
 678
 679     local i
 680     for i in $(seq 1 $num_nodes) ; do
 681         if [ "${CTDB_USE_IPV6}x" != "x" ]; then
 682             echo ::$i >> $nodes
 683             ip addr add ::$i/128 dev lo
 684         else
 685             echo 127.0.0.$i >> $nodes
 686             # 2 public addresses on most nodes, just to make things interesting.
 687             if [ $(($i - 1)) -ne $no_public_ips ] ; then
 688                 echo "192.0.2.$i/24 lo" >> $public_addresses
 689                 echo "192.0.2.$(($i + $num_nodes))/24 lo" >> $public_addresses
 690             fi
 691         fi
 692     done
 693 }
 694
 695 daemons_start_1 ()
 696 {
 697     local pnn="$1"
 698     shift # "$@" gets passed to ctdbd
 699
 700     local var_dir=$CTDB_DIR/tests/var
 701
 702     local nodes=$var_dir/nodes.txt
 703     local public_addresses=$var_dir/public_addresses.txt
 704     local no_public_addresses=$var_dir/no_public_addresses.txt
 705
 706     local no_public_ips=-1
 707     [ -r $no_public_addresses ] && read no_public_ips <$no_public_addresses
 708
 709     if  [ "$no_public_ips" = $pnn ] ; then
 710         echo "Node $no_public_ips will have no public IPs."
 711     fi
 712
 713     local ctdb_options="--reclock=$var_dir/rec.lock --nlist $nodes --nopublicipcheck --event-script-dir=$CTDB_DIR/tests/events.d --logfile=$var_dir/daemons.log -d 0 --dbdir=$var_dir/test.db --dbdir-persistent=$var_dir/test.db/persistent --dbdir-state=$var_dir/test.db/state"
 714
 715     if [ $(id -u) -eq 0 ]; then
 716         ctdb_options="$ctdb_options --public-interface=lo"
 717     fi
 718
 719     if [ $pnn -eq $no_public_ips ] ; then
 720         ctdb_options="$ctdb_options --public-addresses=/dev/null"
 721     else
 722         ctdb_options="$ctdb_options --public-addresses=$public_addresses"
 723     fi
 724
 725     # Need full path so we can use "pkill -f" to kill the daemons.
 726     $VALGRIND $CTDB_DIR/bin/ctdbd --socket=$var_dir/sock.$pnn $ctdb_options "$@" ||return 1
 727 }
 728
 729 daemons_start ()
 730 {
 731     # "$@" gets passed to ctdbd
 732
 733     local num_nodes="${CTDB_TEST_NUM_DAEMONS:-2}" # default is 2 nodes
 734
 735     echo "Starting $num_nodes ctdb daemons..."
 736
 737     for i in $(seq 0 $(($num_nodes - 1))) ; do
 738         daemons_start_1 $i "$@"
 739     done
 740
 741     local var_dir=$CTDB_DIR/tests/var
 742
 743     if [ -L /tmp/ctdb.socket -o ! -S /tmp/ctdb.socket ] ; then
 744         ln -sf $var_dir/sock.0 /tmp/ctdb.socket || return 1
 745     fi
 746 }
 747
 748 #######################################
 749
 750 _ctdb_hack_options ()
 751 {
 752     local ctdb_options="$*"
 753
 754     # We really just want to pass CTDB_OPTIONS but on RH
 755     # /etc/sysconfig/ctdb can, and frequently does, set that variable.
 756     # So instead, we hack badly.  We'll add these as we use them.
 757     # Note that these may still be overridden by the above file... but
 758     # we tend to use the exotic options here... so that is unlikely.
 759
 760     case "$ctdb_options" in
 761         *--start-as-stopped*)
 762             export CTDB_START_AS_STOPPED="yes"
 763     esac
 764 }
 765
 766 _restart_ctdb ()
 767 {
 768     _ctdb_hack_options "$@"
 769
 770     if [ -e /etc/redhat-release ] ; then
 771         service ctdb restart
 772     else
 773         /etc/init.d/ctdb restart
 774     fi
 775 }
 776
 777 _ctdb_start ()
 778 {
 779     _ctdb_hack_options "$@"
 780
 781     /etc/init.d/ctdb start
 782 }
 783
 784 setup_ctdb ()
 785 {
 786     if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 787         daemons_setup
 788     fi
 789 }
 790
 791 # Common things to do after starting one or more nodes.
 792 _ctdb_start_post ()
 793 {
 794     onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1
 795
 796     echo "Setting RerecoveryTimeout to 1"
 797     onnode -pq all "ctdb setvar RerecoveryTimeout 1"
 798
 799     # In recent versions of CTDB, forcing a recovery like this blocks
 800     # until the recovery is complete.  Hopefully this will help the
 801     # cluster to stabilise before a subsequent test.
 802     echo "Forcing a recovery..."
 803     onnode -q 0 ctdb recover
 804     sleep_for 1
 805     echo "Forcing a recovery..."
 806     onnode -q 0 ctdb recover
 807
 808     echo "ctdb is ready"
 809 }
 810
 811 # This assumes that ctdbd is not running on the given node.
 812 ctdb_start_1 ()
 813 {
 814     local pnn="$1"
 815     shift # "$@" is passed to ctdbd start.
 816
 817     echo -n "Starting CTDB on node ${pnn}..."
 818
 819     if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 820         daemons_start_1 $pnn "$@"
 821     else
 822         onnode $pnn $CTDB_TEST_WRAPPER _ctdb_start "$@"
 823     fi
 824
 825     # If we're starting only 1 node then we're doing something weird.
 826     ctdb_restart_when_done
 827 }
 828
 829 restart_ctdb ()
 830 {
 831     # "$@" is passed to ctdbd start.
 832
 833     echo -n "Restarting CTDB"
 834     if $ctdb_test_restart_scheduled ; then
 835         echo -n " (scheduled)"
 836     fi
 837     echo "..."
 838
 839     local i=0
 840     while : ; do
 841         if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 842             daemons_stop
 843             daemons_start "$@"
 844         else
 845             onnode -p all $CTDB_TEST_WRAPPER _restart_ctdb "$@"
 846         fi && break
 847
 848         i=$(($i + 1))
 849         [ $i -lt 5 ] || break
 850
 851         echo "That didn't seem to work - sleeping for a while..."
 852         sleep_for 5
 853     done
 854
 855     onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1
 856
 857     echo "Setting RerecoveryTimeout to 1"
 858     onnode -pq all "ctdb setvar RerecoveryTimeout 1"
 859
 860     # In recent versions of CTDB, forcing a recovery like this blocks
 861     # until the recovery is complete.  Hopefully this will help the
 862     # cluster to stabilise before a subsequent test.
 863     echo "Forcing a recovery..."
 864     onnode -q 0 ctdb recover
 865     sleep_for 1
 866     echo "Forcing a recovery..."
 867     onnode -q 0 ctdb recover
 868
 869     echo "ctdb is ready"
 870 }
 871
 872 ctdb_restart_when_done ()
 873 {
 874     ctdb_test_restart_scheduled=true
 875 }
 876
 877 #######################################
 878
 879 install_eventscript ()
 880 {
 881     local script_name="$1"
 882     local script_contents="$2"
 883
 884     if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
 885         # The quoting here is *very* fragile.  However, we do
 886         # experience the joy of installing a short script using
 887         # onnode, and without needing to know the IP addresses of the
 888         # nodes.
 889         onnode all "f=\"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\" ; echo \"Installing \$f\" ; echo '${script_contents}' > \"\$f\" ; chmod 755 \"\$f\""
 890     else
 891         f="${CTDB_DIR}/tests/events.d/${script_name}"
 892         echo "$script_contents" >"$f"
 893         chmod 755 "$f"
 894     fi
 895 }
 896
 897 uninstall_eventscript ()
 898 {
 899     local script_name="$1"
 900
 901     if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
 902         onnode all "rm -vf \"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\""
 903     else
 904         rm -vf "${CTDB_DIR}/tests/events.d/${script_name}"
 905     fi
 906 }
 907
 908 #######################################
 909
 910 # This section deals with the 99.ctdb_test eventscript.
 911
 912 # Metafunctions: Handle a ctdb-test file on a node.
 913 # given event.
 914 ctdb_test_eventscript_file_create ()
 915 {
 916     local pnn="$1"
 917     local type="$2"
 918
 919     try_command_on_node $pnn touch "/tmp/ctdb-test-${type}.${pnn}"
 920 }
 921
 922 ctdb_test_eventscript_file_remove ()
 923 {
 924     local pnn="$1"
 925     local type="$2"
 926
 927     try_command_on_node $pnn rm -f "/tmp/ctdb-test-${type}.${pnn}"
 928 }
 929
 930 ctdb_test_eventscript_file_exists ()
 931 {
 932     local pnn="$1"
 933     local type="$2"
 934
 935     try_command_on_node $pnn test -f "/tmp/ctdb-test-${type}.${pnn}" >/dev/null 2>&1
 936 }
 937
 938
 939 # Handle a flag file on a node that is removed by 99.ctdb_test on the
 940 # given event.
 941 ctdb_test_eventscript_flag ()
 942 {
 943     local cmd="$1"
 944     local pnn="$2"
 945     local event="$3"
 946
 947     ctdb_test_eventscript_file_${cmd} "$pnn" "flag-${event}"
 948 }
 949
 950
 951 # Handle a trigger that causes 99.ctdb_test to fail it's monitor
 952 # event.
 953 ctdb_test_eventscript_unhealthy_trigger ()
 954 {
 955     local cmd="$1"
 956     local pnn="$2"
 957
 958     ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-trigger"
 959 }
 960
 961 # Handle the file that 99.ctdb_test created to show that it has marked
 962 # a node unhealthy because it detected the above trigger.
 963 ctdb_test_eventscript_unhealthy_detected ()
 964 {
 965     local cmd="$1"
 966     local pnn="$2"
 967
 968     ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-detected"
 969 }
 970
 971 # Handle a trigger that causes 99.ctdb_test to timeout it's monitor
 972 # event.  This should cause the node to be banned.
 973 ctdb_test_eventscript_timeout_trigger ()
 974 {
 975     local cmd="$1"
 976     local pnn="$2"
 977     local event="$3"
 978
 979     ctdb_test_eventscript_file_${cmd} "$pnn" "${event}-timeout"
 980 }
 981
 982 # Note that the eventscript can't use the above functions!
 983 ctdb_test_eventscript_install ()
 984 {
 985
 986     local script='#!/bin/sh
 987 out=$(ctdb pnn)
 988 pnn="${out#PNN:}"
 989
 990 rm -vf "/tmp/ctdb-test-flag-${1}.${pnn}"
 991
 992 trigger="/tmp/ctdb-test-unhealthy-trigger.${pnn}"
 993 detected="/tmp/ctdb-test-unhealthy-detected.${pnn}"
 994 timeout_trigger="/tmp/ctdb-test-${1}-timeout.${pnn}"
 995 case "$1" in
 996     monitor)
 997         if [ -e "$trigger" ] ; then
 998             echo "${0}: Unhealthy because \"$trigger\" detected"
 999             touch "$detected"
1000             exit 1
1001         elif [ -e "$detected" -a ! -e "$trigger" ] ; then
1002             echo "${0}: Healthy again, \"$trigger\" no longer detected"
1003             rm "$detected"
1004         fi
1005
1006         ;;
1007     *)
1008         if [ -e "$timeout_trigger" ] ; then
1009             echo "${0}: Sleeping for a long time because \"$timeout_trigger\" detected"
1010             sleep 9999
1011         fi
1012         ;;
1013         *)
1014
1015 esac
1016
1017 exit 0
1018 '
1019     install_eventscript "99.ctdb_test" "$script"
1020 }
1021
# Remove the 99.ctdb_test eventscript using uninstall_eventscript.
ctdb_test_eventscript_uninstall ()
{
    local script_name="99.ctdb_test"

    uninstall_eventscript "$script_name"
}
1026
# Wait up to 120 seconds for a monitor event on node $1: a flag file
# for the "monitor" event is created and then waited on until it no
# longer exists.
# Note that this only works if you know all other monitor events will
# succeed.  You also need to install the eventscript before using it.
wait_for_monitor_event ()
{
    local pnn="$1"
    local event="monitor"

    echo "Waiting for a monitor event on node ${pnn}..."
    ctdb_test_eventscript_flag create $pnn "$event"

    # "!" makes wait_until wait for the check to fail.
    wait_until 120 ! ctdb_test_eventscript_flag exists $pnn "$event"

}