ctdb-daemon: Add AllowMixedVersions tunable
[samba.git] / ctdb / server / ctdb_keepalive.c
1 /* 
2    monitoring links to all other nodes to detect dead nodes
3
4
5    Copyright (C) Ronnie Sahlberg 2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "replace.h"
22 #include "system/filesys.h"
23 #include "system/network.h"
24 #include "system/time.h"
25 #include "system/wait.h"
26
27 #include <talloc.h>
28 #include <tevent.h>
29
30 #include "lib/util/debug.h"
31 #include "lib/util/samba_util.h"
32
33 #include "ctdb_private.h"
34 #include "version.h"
35
36 #include "common/common.h"
37 #include "common/logging.h"
38
39
40 static uint32_t keepalive_version(void)
41 {
42         return (SAMBA_VERSION_MAJOR << 16) | SAMBA_VERSION_MINOR;
43 }
44
45 static uint32_t keepalive_uptime(struct ctdb_context *ctdb)
46 {
47         struct timeval current = tevent_timeval_current();
48
49         return current.tv_sec - ctdb->ctdbd_start_time.tv_sec;
50 }
51
52 /*
53    send a keepalive packet to the other node
54 */
55 static void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode)
56 {
57         struct ctdb_req_keepalive_old *r;
58
59         if (ctdb->methods == NULL) {
60                 DEBUG(DEBUG_INFO,
61                       ("Failed to send keepalive. Transport is DOWN\n"));
62                 return;
63         }
64
65         r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_KEEPALIVE,
66                                     sizeof(struct ctdb_req_keepalive_old),
67                                     struct ctdb_req_keepalive_old);
68         CTDB_NO_MEMORY_FATAL(ctdb, r);
69         r->hdr.destnode  = destnode;
70         r->hdr.reqid     = 0;
71
72         r->version = keepalive_version();
73         r->uptime = keepalive_uptime(ctdb);
74
75         CTDB_INCREMENT_STAT(ctdb, keepalive_packets_sent);
76
77         ctdb_queue_packet(ctdb, &r->hdr);
78
79         talloc_free(r);
80 }
81
82 /*
83   see if any nodes are dead
84  */
85 static void ctdb_check_for_dead_nodes(struct tevent_context *ev,
86                                       struct tevent_timer *te,
87                                       struct timeval t, void *private_data)
88 {
89         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
90         int i;
91
92         /* send a keepalive to all other nodes, unless */
93         for (i=0;i<ctdb->num_nodes;i++) {
94                 struct ctdb_node *node = ctdb->nodes[i];
95
96                 if (node->flags & NODE_FLAGS_DELETED) {
97                         continue;
98                 }
99
100                 if (node->pnn == ctdb->pnn) {
101                         continue;
102                 }
103                 
104                 if (node->flags & NODE_FLAGS_DISCONNECTED) {
105                         /* it might have come alive again */
106                         if (node->rx_cnt != 0) {
107                                 ctdb_node_connected(node);
108                         }
109                         continue;
110                 }
111
112
113                 if (node->rx_cnt == 0) {
114                         node->dead_count++;
115                 } else {
116                         node->dead_count = 0;
117                 }
118
119                 node->rx_cnt = 0;
120
121                 if (node->dead_count >= ctdb->tunable.keepalive_limit) {
122                         DEBUG(DEBUG_NOTICE,("dead count reached for node %u\n", node->pnn));
123                         ctdb_node_dead(node);
124                         ctdb_send_keepalive(ctdb, node->pnn);
125                         /* maybe tell the transport layer to kill the
126                            sockets as well?
127                         */
128                         continue;
129                 }
130                 
131                 DEBUG(DEBUG_DEBUG,("sending keepalive to %u\n", node->pnn));
132                 ctdb_send_keepalive(ctdb, node->pnn);
133
134                 node->tx_cnt = 0;
135         }
136
137         tevent_add_timer(ctdb->ev, ctdb->keepalive_ctx,
138                          timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
139                          ctdb_check_for_dead_nodes, ctdb);
140 }
141
142
143 void ctdb_start_keepalive(struct ctdb_context *ctdb)
144 {
145         struct tevent_timer *te;
146
147         ctdb->keepalive_ctx = talloc_new(ctdb);
148         CTDB_NO_MEMORY_FATAL(ctdb, ctdb->keepalive_ctx);
149
150         te = tevent_add_timer(ctdb->ev, ctdb->keepalive_ctx,
151                               timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
152                               ctdb_check_for_dead_nodes, ctdb);
153         CTDB_NO_MEMORY_FATAL(ctdb, te);
154
155         DEBUG(DEBUG_NOTICE,("Keepalive monitoring has been started\n"));
156
157         if (ctdb->tunable.allow_mixed_versions == 1) {
158                 DEBUG(DEBUG_WARNING,
159                       ("CTDB cluster with mixed versions configured\n"));
160         }
161 }
162
163 void ctdb_stop_keepalive(struct ctdb_context *ctdb)
164 {
165         talloc_free(ctdb->keepalive_ctx);
166         ctdb->keepalive_ctx = NULL;
167 }
168
169 void ctdb_request_keepalive(struct ctdb_context *ctdb,
170                             struct ctdb_req_header *hdr)
171 {
172         struct ctdb_req_keepalive_old *c =
173                 (struct ctdb_req_keepalive_old *)hdr;
174         uint32_t my_version = keepalive_version();
175         uint32_t my_uptime = keepalive_uptime(ctdb);
176
177         /* Don't check anything if mixed versions are allowed */
178         if (ctdb->tunable.allow_mixed_versions == 1) {
179                 return;
180         }
181
182         if (hdr->length == sizeof(struct ctdb_req_header)) {
183                 /* Old keepalive */
184                 goto fail1;
185         }
186
187         if (c->version != my_version) {
188                 if (c->uptime > my_uptime) {
189                         goto fail2;
190                 } else if (c->uptime == my_uptime) {
191                         if (c->version > my_version) {
192                                 goto fail2;
193                         }
194                 }
195         }
196
197         return;
198
199 fail1:
200         DEBUG(DEBUG_ERR,
201               ("Keepalive version missing from node %u\n", hdr->srcnode));
202         goto shutdown;
203
204 fail2:
205         DEBUG(DEBUG_ERR,
206               ("Keepalive version mismatch 0x%08x != 0x%08x from node %u\n",
207                my_version, c->version, hdr->srcnode));
208         goto shutdown;
209
210 shutdown:
211         DEBUG(DEBUG_ERR,
212               ("CTDB Cluster with mixed versions, cannot continue\n"));
213         ctdb_shutdown_sequence(ctdb, 0);
214 }