ctdb_mutex_ceph_rados_helper: fix deadlock via lock renewals
[samba.git] / ctdb / utils / ceph / test_ceph_rados_reclock.sh
#!/bin/bash
# standalone test for ctdb_mutex_ceph_rados_helper
#
# Copyright (C) David Disseldorp 2016
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

# XXX Site-specific settings - adjust these to match the cluster under test.
# NOTE(review): USER is normally also the login-name environment variable;
# assigning it here presumably overrides it for child processes as well -
# confirm that this is harmless.

# Name of the Ceph cluster under test
CLUSTER='ceph'
# Ceph user - a keyring must exist
USER='client.admin'
# RADOS pool - must exist
POOL='rbd'
# RADOS object: target for lock requests
OBJECT='ctdb_reclock'

# test procedure:
# - using ctdb_mutex_ceph_rados_helper, take a lock on the Ceph RADOS object at
#   $CLUSTER/$POOL/$OBJECT using the Ceph keyring for $USER
#   + confirm that lock is obtained, via ctdb_mutex_ceph_rados_helper "0" output
# - check RADOS object lock state, using the "rados lock info" command
# - attempt to obtain the lock again, using ctdb_mutex_ceph_rados_helper
#   + confirm that the lock is not successfully taken ("1" output=contention)
# - tell the first locker to drop the lock and exit, via SIGTERM
# - once the first locker has exited, attempt to get the lock again
#   + confirm that this attempt succeeds

# Report a test failure and abort the script.
# Arguments: $@ - words of the failure message, joined with single spaces
# Outputs:   "FAILED: <message>" on stdout
# Exits:     always, with status 1
_fail() {
	printf 'FAILED: %s\n' "$*"
	exit 1
}

# This test requires the Ceph "rados" binary, the "jq" JSON parser and the
# helper under test. Abort with status 1, naming the missing tool on stderr,
# as soon as one of them cannot be found in PATH. "command -v" is used instead
# of "which" because it is a shell builtin and therefore always available.
for required_tool in jq rados; do
	if ! command -v "$required_tool" > /dev/null; then
		echo "$0: required command not found: $required_tool" >&2
		exit 1
	fi
done

# As in the original check, the helper's resolved path is left on stdout;
# presumably so the log shows which binary is being exercised.
if ! command -v ctdb_mutex_ceph_rados_helper; then
	echo "$0: required command not found: ctdb_mutex_ceph_rados_helper" >&2
	exit 1
fi

# Scratch directory holding the helpers' stdout and the captured lock states.
TMP_DIR="$(mktemp --directory)" || exit 1

# Exit handler: make sure neither a background locker nor the scratch
# directory outlives the test, whichever way the script terminates (including
# an "exit 1" from a failed check). Only background jobs that this shell still
# tracks are signalled, so an already reaped locker is never targeted.
_cleanup() {
	local leftover_pids
	leftover_pids="$(jobs -p)"
	if [ -n "$leftover_pids" ]; then
		# word splitting is intended: one PID per word
		# shellcheck disable=SC2086
		kill $leftover_pids 2> /dev/null
	fi
	rm -rf -- "${TMP_DIR:?}"
}
trap _cleanup EXIT

# Start from a clean slate by removing the lock target object. The exit status
# is deliberately left unchecked, as in the original; presumably because the
# object does not have to exist yet.
rados -p "$POOL" rm "$OBJECT"

# First locker: runs in the background and keeps running until it is told to
# terminate. Its stdout is captured in $TMP_DIR/first.
(ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" "$POOL" "$OBJECT" \
	> "${TMP_DIR}/first") &
locker_pid=$!

# ctdb_mutex_ceph_rados_helper reports lock acquisition success/failure by
# writing to stdout. Rather than hoping that a fixed delay is long enough,
# poll until the captured output is non-empty, giving up after ten seconds;
# an empty result is then caught by the check below.
for _ in 1 2 3 4 5 6 7 8 9 10; do
	[ -s "${TMP_DIR}/first" ] && break
	sleep 1
done

# "0" means the lock was obtained.
first_out="$(< "${TMP_DIR}/first")"
[[ "$first_out" == "0" ]] \
	|| _fail "expected lock acquisition (0), but got $first_out"

# Capture the lock as RADOS reports it while the first locker holds it. The
# file is kept: a later step compares it against a second capture.
lock_state_file="${TMP_DIR}/lock_state_first"
rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex > "$lock_state_file"

# echo "with lock: $(cat "$lock_state_file")"

# The lock itself: expected name and type.
LOCK_NAME="$(jq -r '.name' "$lock_state_file")"
[[ "$LOCK_NAME" == "ctdb_reclock_mutex" ]] \
	|| _fail "unexpected lock name: $LOCK_NAME"
LOCK_TYPE="$(jq -r '.type' "$lock_state_file")"
[[ "$LOCK_TYPE" == "exclusive" ]] \
	|| _fail "unexpected lock type: $LOCK_TYPE"

# Exactly one locker is expected. The single-bracket test is kept on purpose:
# an empty or non-numeric count makes it fail instead of evaluating to zero.
LOCK_COUNT="$(jq -r '.lockers | length' "$lock_state_file")"
[ "$LOCK_COUNT" -eq 1 ] || _fail "expected 1 lock in rados state, got $LOCK_COUNT"

# That locker must carry the expected cookie and description.
LOCKER_COOKIE="$(jq -r '.lockers[0].cookie' "$lock_state_file")"
[[ "$LOCKER_COOKIE" == "ctdb_reclock_mutex" ]] \
	|| _fail "unexpected locker cookie: $LOCKER_COOKIE"
LOCKER_DESC="$(jq -r '.lockers[0].description' "$lock_state_file")"
[[ "$LOCKER_DESC" == "CTDB recovery lock" ]] \
	|| _fail "unexpected locker description: $LOCKER_DESC"

# Second attempt while the first locker is still holding the lock - expect
# failure. This invocation runs in the foreground, so its output is complete
# by the time it is read back.
ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" "$POOL" "$OBJECT" \
	> "${TMP_DIR}/second"
second_out="$(< "${TMP_DIR}/second")"
[[ "$second_out" == "1" ]] \
	|| _fail "expected lock contention (1), but got $second_out"

# Confirm that the rejected attempt left the lock state untouched: a fresh
# capture must be identical to the one taken before the attempt. Any
# difference is printed by diff before the test fails.
rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex \
	> "${TMP_DIR}/lock_state_second"

diff "${TMP_DIR}/lock_state_first" "${TMP_DIR}/lock_state_second" \
	|| _fail "unexpected lock state change"

# Ask the first locker to drop the lock and terminate; kill sends SIGTERM
# unless told otherwise. Give up if the signal cannot be delivered.
if ! kill "$locker_pid"; then
	exit 1
fi

# Block until the locker has exited; whatever wait prints is discarded.
wait "$locker_pid" > /dev/null 2>&1

# Capture the lock state again now that the first locker has exited.
lock_state_file="${TMP_DIR}/lock_state_third"
rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex > "$lock_state_file"
# echo "without lock: $(cat "$lock_state_file")"

# Name and type are still expected to be reported for the released lock.
LOCK_NAME="$(jq -r '.name' "$lock_state_file")"
[[ "$LOCK_NAME" == "ctdb_reclock_mutex" ]] \
	|| _fail "unexpected lock name: $LOCK_NAME"
LOCK_TYPE="$(jq -r '.type' "$lock_state_file")"
[[ "$LOCK_TYPE" == "exclusive" ]] \
	|| _fail "unexpected lock type: $LOCK_TYPE"

# No locker may remain. The single-bracket test is kept on purpose: an empty
# or non-numeric count makes it fail instead of evaluating to zero.
LOCK_COUNT="$(jq -r '.lockers | length' "$lock_state_file")"
[ "$LOCK_COUNT" -eq 0 ] \
	|| _fail "didn't expect any locks in rados state, got $LOCK_COUNT"

# Third attempt, now that the lock is free: start another background locker
# the same way as the first one, capturing its stdout in $TMP_DIR/third.
(ctdb_mutex_ceph_rados_helper "$CLUSTER" "$USER" "$POOL" "$OBJECT" \
	> "${TMP_DIR}/third") &
locker_pid=$!

# The helper reports the outcome of the lock request on stdout. Poll until
# that output shows up instead of relying on a fixed delay, giving up after
# ten seconds; the output itself is evaluated once the locker has been
# stopped.
for _ in 1 2 3 4 5 6 7 8 9 10; do
	[ -s "${TMP_DIR}/third" ] && break
	sleep 1
done

# Capture the lock state while the new locker is expected to hold the lock.
lock_state_file="${TMP_DIR}/lock_state_fourth"
rados -p "$POOL" lock info "$OBJECT" ctdb_reclock_mutex > "$lock_state_file"
# echo "with lock again: $(cat "$lock_state_file")"

# The lock itself: expected name and type.
LOCK_NAME="$(jq -r '.name' "$lock_state_file")"
[[ "$LOCK_NAME" == "ctdb_reclock_mutex" ]] \
	|| _fail "unexpected lock name: $LOCK_NAME"
LOCK_TYPE="$(jq -r '.type' "$lock_state_file")"
[[ "$LOCK_TYPE" == "exclusive" ]] \
	|| _fail "unexpected lock type: $LOCK_TYPE"

# Exactly one locker is expected again. The single-bracket test is kept on
# purpose: an empty or non-numeric count makes it fail instead of evaluating
# to zero.
LOCK_COUNT="$(jq -r '.lockers | length' "$lock_state_file")"
[ "$LOCK_COUNT" -eq 1 ] || _fail "expected 1 lock in rados state, got $LOCK_COUNT"

# That locker must carry the expected cookie and description.
LOCKER_COOKIE="$(jq -r '.lockers[0].cookie' "$lock_state_file")"
[[ "$LOCKER_COOKIE" == "ctdb_reclock_mutex" ]] \
	|| _fail "unexpected locker cookie: $LOCKER_COOKIE"
LOCKER_DESC="$(jq -r '.lockers[0].description' "$lock_state_file")"
[[ "$LOCKER_DESC" == "CTDB recovery lock" ]] \
	|| _fail "unexpected locker description: $LOCKER_DESC"

# Stop the remaining background locker (SIGTERM) and wait until it has
# exited, so that its captured output is complete before it is read.
kill "$locker_pid" || exit 1
wait "$locker_pid" &> /dev/null

# Its output must report a successful acquisition ("0").
third_out="$(< "${TMP_DIR}/third")"
[[ "$third_out" == "0" ]] \
	|| _fail "expected lock acquisition (0), but got $third_out"

# Every check passed: remove the scratch directory together with its contents.
# The ":?" guard aborts instead of expanding to a path under "/" should
# TMP_DIR ever be empty or unset.
rm -rf -- "${TMP_DIR:?}"

echo "$0: all tests passed"