# now complete.
set +e
- echo "*** TEST COMPLETE (RC=$status), CLEANING UP..."
+ echo "*** TEST COMPLETED (RC=$status) AT $(date '+%F %T'), CLEANING UP..."
eval "$ctdb_test_exit_hook" || true
unset ctdb_test_exit_hook
# leave the recovery in restart_ctdb so that future tests that
# might do a manual restart mid-test will benefit.
echo "Forcing a recovery..."
- onnode 0 ctdb recover
+ onnode 0 $CTDB recover
fi
exit $status
sanity_check_ips ()
{
- local ips="$1" # Output of "ctdb ip -n all"
+ local ips="$1" # list of "ip node" lines
echo "Sanity checking IPs..."
return 1
}
+# This returns a list of "ip node" lines in $out (set as a side effect
+# by try_command_on_node).  All arguments are passed through to
+# try_command_on_node, so a node number and any leading options work.
+all_ips_on_node()
+{
+    # "$*" is the well-defined way to join all arguments into one word;
+    # the previous "local node=$@" relied on unspecified scalar
+    # assignment of $@ and behaved the same only by accident.
+    local node="$*"
+    try_command_on_node $node "$CTDB ip -Y -n all | cut -d ':' -f1-3 | sed -e '1d' -e 's@^:@@' -e 's@:@ @g'"
+}
+
select_test_node_and_ips ()
{
- try_command_on_node 0 "$CTDB ip -n all | sed -e '1d'"
+ all_ips_on_node 0
# When selecting test_node we just want a node that has public
# IPs. This will work and is economically semi-random. :-)
{
local out x count line
- out=$(ctdb -Y status 2>&1) || return 1
+ out=$($CTDB -Y status 2>/dev/null) || return 1
{
read x
count=0
while read line ; do
- count=$(($count + 1))
+ # We need to see valid lines if we're going to be healthy.
+ [ "${line#:[0-9]}" != "$line" ] && count=$(($count + 1))
+ # A line indicating a node is unhealthy causes failure.
[ "${line##:*:*:*1:}" != "$line" ] && return 1
done
[ $count -gt 0 ] && return $?
else
echo "Cluster is UNHEALTHY"
if ! ${ctdb_test_restart_scheduled:-false} ; then
- echo "DEBUG:"
+ echo "DEBUG AT $(date '+%F %T'):"
local i
- for i in "onnode -q 0 ctdb status" "onnode -q 0 onnode all ctdb scriptstatus" ; do
+ for i in "onnode -q 0 $CTDB status" "onnode -q 0 onnode all $CTDB scriptstatus" ; do
echo "$i"
$i || true
done
if [ -n "$bits" ] ; then
local out x line
- out=$(ctdb -Y status 2>&1) || return 1
+ out=$($CTDB -Y status 2>&1) || return 1
{
read x
return 1
} <<<"$out" # Yay bash!
elif [ -n "$fpat" ] ; then
- ctdb statistics -n "$pnn" | egrep -q "$fpat"
+ $CTDB statistics -n "$pnn" | egrep -q "$fpat"
elif [ -n "$mpat" ] ; then
- ctdb getmonmode -n "$pnn" | egrep -q "$mpat"
+ $CTDB getmonmode -n "$pnn" | egrep -q "$mpat"
else
echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
return 1
echo "Waiting until node $pnn has status \"$status\"..."
- if ! onnode any $CTDB_TEST_WRAPPER wait_until $timeout node_has_status "$pnn" "$status" ; then
- for i in "onnode -q any ctdb status" "onnode -q any onnode all ctdb scriptstatus" ; do
+ if ! wait_until $timeout onnode any $CTDB_TEST_WRAPPER node_has_status "$pnn" "$status" ; then
+ for i in "onnode -q any $CTDB status" "onnode -q any onnode all $CTDB scriptstatus" ; do
echo "$i"
$i || true
done
local out
- try_command_on_node 1 ctdb ip -n all
+ all_ips_on_node 1
while read ip pnn ; do
for check in $ips ; do
wait_until 60 ips_are_on_nodeglob "$@"
}
+# Succeed (return 0) iff node $1 currently hosts at least one public IP.
+node_has_some_ips ()
+{
+    local node="$1"
+
+    # $out is populated below as a side effect of all_ips_on_node
+    # (via try_command_on_node): one "ip node" line per public IP.
+    local out
+
+    # NOTE(review): this queries node 1 regardless of $node.  "ip -n all"
+    # reports IPs for every node, so the result still covers $node, but
+    # it assumes node 1 is reachable -- confirm that is intended.
+    all_ips_on_node 1
+
+    while read ip pnn ; do
+	if [ "$node" = "$pnn" ] ; then
+	    return 0
+	fi
+    done <<<"$out" # bashism to avoid problem setting variable in pipeline.
+
+    return 1
+}
+
+# Block for up to 60 seconds until node $1 hosts at least one public IP.
+# All arguments are passed through to node_has_some_ips.
+wait_until_node_has_some_ips ()
+{
+    echo "Waiting for node to have some IPs..."
+
+    wait_until 60 node_has_some_ips "$@"
+}
+
get_src_socket ()
{
local proto="$1"
echo "Waiting for tcpdump to capture some packets..."
if ! wait_until 30 tcpdump_check ; then
- echo "DEBUG:"
+ echo "DEBUG AT $(date '+%F %T'):"
local i
- for i in "onnode -q 0 ctdb status" "netstat -tanp" "tcpdump -n -e -r $tcpdump_filename" ; do
+ for i in "onnode -q 0 $CTDB status" "netstat -tanp" "tcpdump -n -e -r $tcpdump_filename" ; do
echo "$i"
$i || true
done
daemons_stop ()
{
echo "Attempting to politely shutdown daemons..."
- onnode 1 ctdb shutdown -n all || true
+ onnode 1 $CTDB shutdown -n all || true
echo "Sleeping for a while..."
sleep_for 1
echo "Node $no_public_ips will have no public IPs."
fi
- local ctdb_options="--reclock=$var_dir/rec.lock --nlist $nodes --nopublicipcheck --event-script-dir=$CTDB_DIR/tests/events.d --logfile=$var_dir/daemons.log -d 0 --dbdir=$var_dir/test.db --dbdir-persistent=$var_dir/test.db/persistent"
+ local ctdb_options="--reclock=$var_dir/rec.lock --nlist $nodes --nopublicipcheck --event-script-dir=$CTDB_DIR/tests/events.d --logfile=$var_dir/daemons.log -d 0 --dbdir=$var_dir/test.db --dbdir-persistent=$var_dir/test.db/persistent --dbdir-state=$var_dir/test.db/state"
if [ $(id -u) -eq 0 ]; then
ctdb_options="$ctdb_options --public-interface=lo"
onnode -q 1 $CTDB_TEST_WRAPPER wait_until_healthy || return 1
echo "Setting RerecoveryTimeout to 1"
- onnode -pq all "ctdb setvar RerecoveryTimeout 1"
+ onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
# In recent versions of CTDB, forcing a recovery like this blocks
# until the recovery is complete. Hopefully this will help the
# cluster to stabilise before a subsequent test.
echo "Forcing a recovery..."
- onnode -q 0 ctdb recover
+ onnode -q 0 $CTDB recover
sleep_for 1
echo "Forcing a recovery..."
- onnode -q 0 ctdb recover
+ onnode -q 0 $CTDB recover
echo "ctdb is ready"
}
onnode -q 1 $CTDB_TEST_WRAPPER wait_until_healthy || return 1
echo "Setting RerecoveryTimeout to 1"
- onnode -pq all "ctdb setvar RerecoveryTimeout 1"
+ onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
# In recent versions of CTDB, forcing a recovery like this blocks
# until the recovery is complete. Hopefully this will help the
# cluster to stabilise before a subsequent test.
echo "Forcing a recovery..."
- onnode -q 0 ctdb recover
+ onnode -q 0 $CTDB recover
sleep_for 1
echo "Forcing a recovery..."
- onnode -q 0 ctdb recover
+ onnode -q 0 $CTDB recover
echo "ctdb is ready"
}
ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-detected"
}
+# Handle a trigger that causes 99.ctdb_test to time out its monitor
+# event. This should cause the node to be banned.
+# Arguments: $1 - flag-file operation (passed to
+#                 ctdb_test_eventscript_file_<cmd>, e.g. create/remove),
+#            $2 - node PNN,
+#            $3 - event name; the flag file used is "<event>-timeout".
+ctdb_test_eventscript_timeout_trigger ()
+{
+    local cmd="$1"
+    local pnn="$2"
+    local event="$3"
+
+    ctdb_test_eventscript_file_${cmd} "$pnn" "${event}-timeout"
+}
+
# Note that the eventscript can't use the above functions!
ctdb_test_eventscript_install ()
{
local script='#!/bin/sh
-out=$(ctdb pnn)
+out=$($CTDB pnn)
pnn="${out#PNN:}"
rm -vf "/tmp/ctdb-test-flag-${1}.${pnn}"
trigger="/tmp/ctdb-test-unhealthy-trigger.${pnn}"
detected="/tmp/ctdb-test-unhealthy-detected.${pnn}"
-if [ "$1" = "monitor" ] ; then
-    if [ -e "$trigger" ] ; then
-	echo "${0}: Unhealthy because \"$trigger\" detected"
-	touch "$detected"
-	exit 1
-    elif [ -e "$detected" -a ! -e "$trigger" ] ; then
-	echo "${0}: Healthy again, \"$trigger\" no longer detected"
-	rm "$detected"
-    fi
-fi
+timeout_trigger="/tmp/ctdb-test-${1}-timeout.${pnn}"
+case "$1" in
+    monitor)
+	if [ -e "$trigger" ] ; then
+	    echo "${0}: Unhealthy because \"$trigger\" detected"
+	    touch "$detected"
+	    exit 1
+	elif [ -e "$detected" -a ! -e "$trigger" ] ; then
+	    echo "${0}: Healthy again, \"$trigger\" no longer detected"
+	    rm "$detected"
+	fi
+	;;
+    *)
+	# Non-monitor events: hang if the timeout trigger file exists,
+	# simulating an event that times out.
+	if [ -e "$timeout_trigger" ] ; then
+	    echo "${0}: Sleeping for a long time because \"$timeout_trigger\" detected"
+	    sleep 9999
+	fi
+	;;
+esac
exit 0
'
{
local pnn="$1"
- echo "Waiting for a monitor event on node $pnn to complete..."
+ echo "Waiting for a monitor event on node ${pnn}..."
ctdb_test_eventscript_flag create $pnn "monitor"
wait_until 120 ! ctdb_test_eventscript_flag exists $pnn "monitor"
}
+
+# Default $CTDB to the plain "ctdb" binary when the caller has not set it.
+CTDB="${CTDB:-ctdb}"