Test suite: Add more timestamping of debugging information.

[metze/ctdb/wip.git] / tests / scripts / ctdb_test_functions.bash
diff --git a/tests/scripts/ctdb_test_functions.bash b/tests/scripts/ctdb_test_functions.bash

index 832a42bf16892907a874dc822a581aa304857579..68f7994da07e29336c4af9655509f4319975de65 100644 (file)
--- a/tests/scripts/ctdb_test_functions.bash
+++ b/tests/scripts/ctdb_test_functions.bash
@@ -66,7 +66,7 @@ ctdb_test_exit ()
      # now complete.
      set +e
  
-    echo "*** TEST COMPLETE (RC=$status), CLEANING UP..."
+    echo "*** TEST COMPLETED (RC=$status) AT $(date '+%F %T'), CLEANING UP..."
  
      eval "$ctdb_test_exit_hook" || true
      unset ctdb_test_exit_hook
@@ -80,7 +80,7 @@ ctdb_test_exit ()
         # leave the recovery in restart_ctdb so that future tests that
         # might do a manual restart mid-test will benefit.
         echo "Forcing a recovery..."
-       onnode 0 ctdb recover
+       onnode 0 $CTDB recover
      fi
  
      exit $status
@@ -239,7 +239,7 @@ sanity_check_output ()
  
  sanity_check_ips ()
  {
-    local ips="$1" # Output of "ctdb ip -n all"
+    local ips="$1" # list of "ip node" lines
  
      echo "Sanity checking IPs..."
  
@@ -259,9 +259,16 @@ sanity_check_ips ()
      return 1
  }
  
+# This returns a list of "ip node" lines in $out
+all_ips_on_node()
+{
+    local node=$@
+    try_command_on_node $node "$CTDB ip -Y -n all | cut -d ':' -f1-3 | sed -e '1d' -e 's@^:@@' -e 's@:@ @g'"
+}
+
  select_test_node_and_ips ()
  {
-    try_command_on_node 0 "$CTDB ip -n all | sed -e '1d'"
+    all_ips_on_node 0
  
      # When selecting test_node we just want a node that has public
      # IPs.  This will work and is economically semi-random.  :-)
@@ -329,13 +336,15 @@ _cluster_is_healthy ()
  {
      local out x count line
  
-    out=$(ctdb -Y status 2>&1) || return 1
+    out=$($CTDB -Y status 2>/dev/null) || return 1
  
      {
          read x
         count=0
          while read line ; do
-           count=$(($count + 1))
+           # We need to see valid lines if we're going to be healthy.
+           [ "${line#:[0-9]}" != "$line" ] && count=$(($count + 1))
+           # A line indicating a node is unhealthy causes failure.
             [ "${line##:*:*:*1:}" != "$line" ] && return 1
          done
         [ $count -gt 0 ] && return $?
@@ -350,9 +359,9 @@ cluster_is_healthy ()
      else
         echo "Cluster is UNHEALTHY"
         if ! ${ctdb_test_restart_scheduled:-false} ; then
-           echo "DEBUG:"
+           echo "DEBUG AT $(date '+%F %T'):"
             local i
-           for i in "onnode -q 0 ctdb status" "onnode -q 0 onnode all ctdb scriptstatus" ; do
+           for i in "onnode -q 0 $CTDB status" "onnode -q 0 onnode all $CTDB scriptstatus" ; do
                 echo "$i"
                 $i || true
             done
@@ -400,7 +409,7 @@ node_has_status ()
      if [ -n "$bits" ] ; then
         local out x line
  
-       out=$(ctdb -Y status 2>&1) || return 1
+       out=$($CTDB -Y status 2>&1) || return 1
  
         {
              read x
@@ -413,9 +422,9 @@ node_has_status ()
             return 1
         } <<<"$out" # Yay bash!
      elif [ -n "$fpat" ] ; then
-       ctdb statistics -n "$pnn" | egrep -q "$fpat"
+       $CTDB statistics -n "$pnn" | egrep -q "$fpat"
      elif [ -n "$mpat" ] ; then
-       ctdb getmonmode -n "$pnn" | egrep -q "$mpat"
+       $CTDB getmonmode -n "$pnn" | egrep -q "$mpat"
      else
         echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
         return 1
@@ -430,8 +439,8 @@ wait_until_node_has_status ()
  
      echo "Waiting until node $pnn has status \"$status\"..."
  
-    if ! onnode any $CTDB_TEST_WRAPPER wait_until $timeout node_has_status "$pnn" "$status" ; then
-       for i in "onnode -q any ctdb status" "onnode -q any onnode all ctdb scriptstatus" ; do
+    if ! wait_until $timeout onnode any $CTDB_TEST_WRAPPER node_has_status "$pnn" "$status" ; then
+       for i in "onnode -q any $CTDB status" "onnode -q any onnode all $CTDB scriptstatus" ; do
             echo "$i"
             $i || true
         done
@@ -450,7 +459,7 @@ ips_are_on_nodeglob ()
  
      local out
  
-    try_command_on_node 1 ctdb ip -n all
+    all_ips_on_node 1
  
      while read ip pnn ; do
         for check in $ips ; do
@@ -475,6 +484,30 @@ wait_until_ips_are_on_nodeglob ()
      wait_until 60 ips_are_on_nodeglob "$@"
  }
  
+node_has_some_ips ()
+{
+    local node="$1"
+    # ip/pnn are local so the read loop cannot clobber a caller's variables.
+    local out ip pnn
+    # all_ips_on_node leaves a list of "ip node" lines in $out.
+    all_ips_on_node 1
+    # Succeed as soon as any line names $node as the node holding an IP.
+    while read -r ip pnn ; do
+       if [ "$node" = "$pnn" ] ; then
+           return 0
+       fi
+    done <<<"$out" # bashism to avoid problem setting variable in pipeline.
+
+    return 1
+}
+
+wait_until_node_has_some_ips ()
+{
+    echo "Waiting for node to have some IPs..."
+
+    wait_until 60 node_has_some_ips "$@"
+}
+
  get_src_socket ()
  {
      local proto="$1"
@@ -548,9 +581,9 @@ tcpdump_wait ()
  
      echo "Waiting for tcpdump to capture some packets..."
      if ! wait_until 30 tcpdump_check ; then
-       echo "DEBUG:"
+       echo "DEBUG AT $(date '+%F %T'):"
         local i
-       for i in "onnode -q 0 ctdb status" "netstat -tanp" "tcpdump -n -e -r $tcpdump_filename" ; do
+       for i in "onnode -q 0 $CTDB status" "netstat -tanp" "tcpdump -n -e -r $tcpdump_filename" ; do
             echo "$i"
             $i || true
         done
@@ -607,7 +640,7 @@ gratarp_sniff_wait_show ()
  daemons_stop ()
  {
      echo "Attempting to politely shutdown daemons..."
-    onnode 1 ctdb shutdown -n all || true
+    onnode 1 $CTDB shutdown -n all || true
  
      echo "Sleeping for a while..."
      sleep_for 1
@@ -679,7 +712,7 @@ daemons_start_1 ()
         echo "Node $no_public_ips will have no public IPs."
      fi
  
-    local ctdb_options="--reclock=$var_dir/rec.lock --nlist $nodes --nopublicipcheck --event-script-dir=$CTDB_DIR/tests/events.d --logfile=$var_dir/daemons.log -d 0 --dbdir=$var_dir/test.db --dbdir-persistent=$var_dir/test.db/persistent"
+    local ctdb_options="--reclock=$var_dir/rec.lock --nlist $nodes --nopublicipcheck --event-script-dir=$CTDB_DIR/tests/events.d --logfile=$var_dir/daemons.log -d 0 --dbdir=$var_dir/test.db --dbdir-persistent=$var_dir/test.db/persistent --dbdir-state=$var_dir/test.db/state"
  
      if [ $(id -u) -eq 0 ]; then
          ctdb_options="$ctdb_options --public-interface=lo"
@@ -763,16 +796,16 @@ _ctdb_start_post ()
      onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1
  
      echo "Setting RerecoveryTimeout to 1"
-    onnode -pq all "ctdb setvar RerecoveryTimeout 1"
+    onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
  
      # In recent versions of CTDB, forcing a recovery like this blocks
      # until the recovery is complete.  Hopefully this will help the
      # cluster to stabilise before a subsequent test.
      echo "Forcing a recovery..."
-    onnode -q 0 ctdb recover
+    onnode -q 0 $CTDB recover
      sleep_for 1
      echo "Forcing a recovery..."
-    onnode -q 0 ctdb recover
+    onnode -q 0 $CTDB recover
  
      echo "ctdb is ready"
  }
@@ -824,16 +857,16 @@ restart_ctdb ()
      onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1
  
      echo "Setting RerecoveryTimeout to 1"
-    onnode -pq all "ctdb setvar RerecoveryTimeout 1"
+    onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
  
      # In recent versions of CTDB, forcing a recovery like this blocks
      # until the recovery is complete.  Hopefully this will help the
      # cluster to stabilise before a subsequent test.
      echo "Forcing a recovery..."
-    onnode -q 0 ctdb recover
+    onnode -q 0 $CTDB recover
      sleep_for 1
      echo "Forcing a recovery..."
-    onnode -q 0 ctdb recover
+    onnode -q 0 $CTDB recover
  
      echo "ctdb is ready"
  }
@@ -937,28 +970,51 @@ ctdb_test_eventscript_unhealthy_detected ()
      ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-detected"
  }
  
+# Handle a trigger that causes 99.ctdb_test to time out its monitor
+# event.  This should cause the node to be banned.
+ctdb_test_eventscript_timeout_trigger ()
+{
+    local cmd="$1"
+    local pnn="$2"
+    local event="$3"
+
+    ctdb_test_eventscript_file_${cmd} "$pnn" "${event}-timeout"
+}
+
  # Note that the eventscript can't use the above functions!
  ctdb_test_eventscript_install ()
  {
  
      local script='#!/bin/sh
-out=$(ctdb pnn)
+out=$($CTDB pnn)
  pnn="${out#PNN:}"
  
  rm -vf "/tmp/ctdb-test-flag-${1}.${pnn}"
  
  trigger="/tmp/ctdb-test-unhealthy-trigger.${pnn}"
  detected="/tmp/ctdb-test-unhealthy-detected.${pnn}"
-if [ "$1" = "monitor" ] ; then
-    if [ -e "$trigger" ] ; then
-        echo "${0}: Unhealthy because \"$trigger\" detected"
-        touch "$detected"
-        exit 1
-    elif [ -e "$detected" -a ! -e "$trigger" ] ; then
-        echo "${0}: Healthy again, \"$trigger\" no longer detected"
-        rm "$detected"
-    fi
-fi
+timeout_trigger="/tmp/ctdb-test-${1}-timeout.${pnn}"
+case "$1" in
+    monitor)
+        if [ -e "$trigger" ] ; then
+            echo "${0}: Unhealthy because \"$trigger\" detected"
+            touch "$detected"
+            exit 1
+        elif [ -e "$detected" -a ! -e "$trigger" ] ; then
+            echo "${0}: Healthy again, \"$trigger\" no longer detected"
+            rm "$detected"
+        fi
+       
+       ;;
+    *)
+        if [ -e "$timeout_trigger" ] ; then
+            echo "${0}: Sleeping for a long time because \"$timeout_trigger\" detected"
+            sleep 9999
+        fi
+       ;;
+       *)
+       
+esac
  
  exit 0
  '
@@ -976,9 +1032,12 @@ wait_for_monitor_event ()
  {
      local pnn="$1"
  
-    echo "Waiting for a monitor event on node $pnn to complete..."
+    echo "Waiting for a monitor event on node ${pnn}..."
      ctdb_test_eventscript_flag create $pnn "monitor"
  
      wait_until 120 ! ctdb_test_eventscript_flag exists $pnn "monitor"
  
  }
+
+# Make sure that $CTDB is set.
+: ${CTDB:=ctdb}