ctdb/server/ctdb_monitor.c
1 /* 
2    monitoring links to all other nodes to detect dead nodes
3
4
5    Copyright (C) Ronnie Sahlberg 2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "replace.h"
22 #include "system/filesys.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25
26 #include <talloc.h>
27 #include <tevent.h>
28
29 #include "lib/util/debug.h"
30 #include "lib/util/samba_util.h"
31
32 #include "ctdb_private.h"
33 #include "ctdb_logging.h"
34
35 #include "common/system.h"
36 #include "common/common.h"
37
struct ctdb_monitor_state {
        uint32_t monitoring_mode;    /* CTDB_MONITORING_ACTIVE or CTDB_MONITORING_DISABLED */
        TALLOC_CTX *monitor_context; /* talloc context owning the monitoring timers */
        uint32_t next_interval;      /* seconds until the next monitor event */
};

static void ctdb_check_health(struct tevent_context *ev,
                              struct tevent_timer *te,
                              struct timeval t, void *private_data);

/*
  setup the notification script
*/
int ctdb_set_notification_script(struct ctdb_context *ctdb, const char *script)
{
        ctdb->notification_script = talloc_strdup(ctdb, script);
        CTDB_NO_MEMORY(ctdb, ctdb->notification_script);
        return 0;
}

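/*
  Run the notification script synchronously (called from a forked child).
  The script is invoked via system() as "<script> <event>", so a script
  registered as, say, /usr/local/etc/ctdb/notify.sh (hypothetical path,
  not mandated here) would be run as "notify.sh unhealthy" when the node
  becomes unhealthy.
*/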
static int ctdb_run_notification_script_child(struct ctdb_context *ctdb, const char *event)
{
        struct stat st;
        int ret;
        char *cmd;

        if (stat(ctdb->notification_script, &st) != 0) {
                DEBUG(DEBUG_ERR,("Could not stat notification script %s. Can not send notifications.\n", ctdb->notification_script));
                return -1;
        }
        if (!(st.st_mode & S_IXUSR)) {
                DEBUG(DEBUG_ERR,("Notification script %s is not executable.\n", ctdb->notification_script));
                return -1;
        }

        cmd = talloc_asprintf(ctdb, "%s %s\n", ctdb->notification_script, event);
        CTDB_NO_MEMORY(ctdb, cmd);

        ret = system(cmd);
        /* if the system() call was successful, translate ret into the
           return code from the command
        */
        if (ret != -1) {
                ret = WEXITSTATUS(ret);
        }
        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Notification script \"%s\" failed with error %d\n", cmd, ret));
        }

        return ret;
}

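/*
  Run the notification script asynchronously: fork a child so that a slow
  or hanging script cannot block the ctdb daemon's event loop.  The parent
  does not wait for the child and ignores the script's exit status.
*/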
void ctdb_run_notification_script(struct ctdb_context *ctdb, const char *event)
{
        pid_t child;

        if (ctdb->notification_script == NULL) {
                return;
        }

        child = ctdb_fork(ctdb);
        if (child == (pid_t)-1) {
                DEBUG(DEBUG_ERR,("Failed to fork() a notification child process\n"));
                return;
        }
        if (child == 0) {
                int ret;

                ctdb_set_process_name("ctdb_notification");
                debug_extra = talloc_asprintf(NULL, "notification-%s:", event);
                ret = ctdb_run_notification_script_child(ctdb, event);
                if (ret != 0) {
                        DEBUG(DEBUG_ERR,(__location__ " Notification script failed\n"));
                }
                _exit(0);
        }

        return;
}
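/*
  The events passed to ctdb_run_notification_script() within this file are
  "startup", "healthy" and "unhealthy"; callers elsewhere may pass other
  event names.
*/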

/*
  called when a health monitoring event script finishes
 */
static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
{
        struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
        TDB_DATA data;
        struct ctdb_node_flag_change c;
        uint32_t next_interval;
        int ret;
        TDB_DATA rddata;
        struct ctdb_srvid_message rd;
        const char *state_str = NULL;

        c.pnn = ctdb->pnn;
        c.old_flags = node->flags;

        rd.pnn   = ctdb->pnn;
        rd.srvid = CTDB_SRVID_TAKEOVER_RUN_RESPONSE;

        rddata.dptr = (uint8_t *)&rd;
        rddata.dsize = sizeof(rd);

        if (status == -ECANCELED) {
                DEBUG(DEBUG_ERR,("Monitoring event was cancelled\n"));
                goto after_change_status;
        }

        if (status == -ETIME) {
                ctdb->event_script_timeouts++;

                if (ctdb->event_script_timeouts >= ctdb->tunable.script_timeout_count) {
                        DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_timeout_count));
                } else {
                        /* We pretend this is OK. */
                        goto after_change_status;
                }
        }

        if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
                DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n"));
                node->flags |= NODE_FLAGS_UNHEALTHY;
                ctdb->monitor->next_interval = 5;

                ctdb_run_notification_script(ctdb, "unhealthy");
        } else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) {
                DEBUG(DEBUG_NOTICE,("monitor event OK - node re-enabled\n"));
                node->flags &= ~NODE_FLAGS_UNHEALTHY;
                ctdb->monitor->next_interval = 5;

                ctdb_run_notification_script(ctdb, "healthy");
        }

after_change_status:
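        /*
          Reschedule the next monitor event with exponential backoff: the
          current next_interval is used for this timer and then doubled,
          capped at the MonitorInterval tunable.  After a health state
          change the interval is reset to 5 seconds above, so the sequence
          is roughly 5, 10, 20, ... seconds up to the cap.
        */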
        next_interval = ctdb->monitor->next_interval;

        ctdb->monitor->next_interval *= 2;
        if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) {
                ctdb->monitor->next_interval = ctdb->tunable.monitor_interval;
        }

        tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                         timeval_current_ofs(next_interval, 0),
                         ctdb_check_health, ctdb);

        if (c.old_flags == node->flags) {
                return;
        }

        c.new_flags = node->flags;

        data.dptr = (uint8_t *)&c;
        data.dsize = sizeof(c);

        /* ask the recovery daemon to push these changes out to all nodes */
        ctdb_daemon_send_message(ctdb, ctdb->pnn,
                                 CTDB_SRVID_PUSH_NODE_FLAGS, data);

        if (c.new_flags & NODE_FLAGS_UNHEALTHY) {
                state_str = "UNHEALTHY";
        } else {
                state_str = "HEALTHY";
        }

        /* ask the recmaster to reallocate all addresses */
        DEBUG(DEBUG_ERR,
              ("Node became %s. Ask recovery master to reallocate IPs\n",
               state_str));
        ret = ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_TAKEOVER_RUN, rddata);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,
                      (__location__
                       " Failed to send IP takeover run request\n"));
        }
}


static void ctdb_run_startup(struct tevent_context *ev,
                             struct tevent_timer *te,
                             struct timeval t, void *private_data);
/*
  called when the startup event script finishes
 */
static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p)
{
        if (status != 0) {
                DEBUG(DEBUG_ERR,("startup event failed\n"));
                tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                                 timeval_current_ofs(5, 0),
                                 ctdb_run_startup, ctdb);
                return;
        }

        DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n"));
        ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING);
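        /*
          Start with a short 2 second interval so the first monitor event
          runs soon after startup; ctdb_health_callback() then doubles the
          interval up to the MonitorInterval tunable.
        */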
        ctdb->monitor->next_interval = 2;
        ctdb_run_notification_script(ctdb, "startup");

        ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;

        tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                         timeval_current_ofs(ctdb->monitor->next_interval, 0),
                         ctdb_check_health, ctdb);
}

static void ctdb_run_startup(struct tevent_context *ev,
                             struct tevent_timer *te,
                             struct timeval t, void *private_data)
{
        struct ctdb_context *ctdb = talloc_get_type(private_data,
                                                    struct ctdb_context);
        int ret;

        /* This is necessary to avoid the "startup" event colliding
         * with the "ipreallocated" event from the takeover run
         * following the first recovery.  We might as well serialise
         * these things if we can.
         */
        if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) {
                DEBUG(DEBUG_NOTICE,
                      ("Not yet in startup runstate. Wait one more second\n"));
                tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                                 timeval_current_ofs(1, 0),
                                 ctdb_run_startup, ctdb);
                return;
        }

        /* release any IPs we hold from previous runs of the daemon */
        ctdb_release_all_ips(ctdb);

        DEBUG(DEBUG_NOTICE,("Running the \"startup\" event.\n"));
        ret = ctdb_event_script_callback(ctdb,
                                         ctdb->monitor->monitor_context,
                                         ctdb_startup_callback,
                                         ctdb, CTDB_EVENT_STARTUP, "%s", "");

        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Unable to launch startup event script\n"));
                tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                                 timeval_current_ofs(5, 0),
                                 ctdb_run_startup, ctdb);
        }
}

/*
  wait until we have finished initial recoveries before we start the
  monitoring events
 */
static void ctdb_wait_until_recovered(struct tevent_context *ev,
                                      struct tevent_timer *te,
                                      struct timeval t, void *private_data)
{
        struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
        int ret;
        static int count = 0;

        count++;

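        /*
          The timer below re-arms every second, so rate-limit this log
          message: log on every iteration for the first minute and then
          only once every 600 iterations (roughly every 10 minutes).
        */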
        if (count < 60 || count % 600 == 0) {
                DEBUG(DEBUG_NOTICE,("CTDB_WAIT_UNTIL_RECOVERED\n"));
                if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_STOPPED) {
                        DEBUG(DEBUG_NOTICE,("Node is STOPPED. Node will NOT recover.\n"));
                }
        }

        if (ctdb->vnn_map->generation == INVALID_GENERATION) {
                ctdb->db_persistent_startup_generation = INVALID_GENERATION;

                tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                                 timeval_current_ofs(1, 0),
                                 ctdb_wait_until_recovered, ctdb);
                return;
        }

        if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
                ctdb->db_persistent_startup_generation = INVALID_GENERATION;

                DEBUG(DEBUG_NOTICE,(__location__ " in recovery. Wait one more second\n"));
                tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                                 timeval_current_ofs(1, 0),
                                 ctdb_wait_until_recovered, ctdb);
                return;
        }

        if (!fast_start && timeval_elapsed(&ctdb->last_recovery_finished) < (ctdb->tunable.rerecovery_timeout + 3)) {
                ctdb->db_persistent_startup_generation = INVALID_GENERATION;

                DEBUG(DEBUG_NOTICE,(__location__ " wait for pending recoveries to end. Wait one more second.\n"));

                tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                                 timeval_current_ofs(1, 0),
                                 ctdb_wait_until_recovered, ctdb);
                return;
        }

        if (ctdb->vnn_map->generation == ctdb->db_persistent_startup_generation) {
                DEBUG(DEBUG_INFO,(__location__ " skip ctdb_recheck_persistent_health() "
                                  "until the next recovery\n"));
                tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                                 timeval_current_ofs(1, 0),
                                 ctdb_wait_until_recovered, ctdb);
                return;
        }

        ctdb->db_persistent_startup_generation = ctdb->vnn_map->generation;
        ret = ctdb_recheck_persistent_health(ctdb);
        if (ret != 0) {
                ctdb->db_persistent_check_errors++;
                if (ctdb->db_persistent_check_errors < ctdb->max_persistent_check_errors) {
                        DEBUG(ctdb->db_persistent_check_errors==1?DEBUG_ERR:DEBUG_WARNING,
                              (__location__ " ctdb_recheck_persistent_health() "
                              "failed (%llu of %llu times) - retry later\n",
                              (unsigned long long)ctdb->db_persistent_check_errors,
                              (unsigned long long)ctdb->max_persistent_check_errors));
                        tevent_add_timer(ctdb->ev,
                                         ctdb->monitor->monitor_context,
                                         timeval_current_ofs(1, 0),
                                         ctdb_wait_until_recovered, ctdb);
                        return;
                }
                DEBUG(DEBUG_ALERT,(__location__
                                  " ctdb_recheck_persistent_health() failed (%llu times) - prepare shutdown\n",
                                  (unsigned long long)ctdb->db_persistent_check_errors));
                ctdb_shutdown_sequence(ctdb, 11);
                /* In case above returns due to duplicate shutdown */
                return;
        }
        ctdb->db_persistent_check_errors = 0;

        tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                         timeval_current(), ctdb_run_startup, ctdb);
}


/*
  see if the event scripts think we are healthy
 */
static void ctdb_check_health(struct tevent_context *ev,
                              struct tevent_timer *te,
                              struct timeval t, void *private_data)
{
        struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
        bool skip_monitoring = false;
        int ret = 0;

        if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL ||
            ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED) {
                skip_monitoring = true;
        } else {
                if (ctdb_db_all_frozen(ctdb)) {
                        DEBUG(DEBUG_ERR,
                              ("Skip monitoring since databases are frozen\n"));
                        skip_monitoring = true;
                }
        }

        if (skip_monitoring) {
                tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                                 timeval_current_ofs(ctdb->monitor->next_interval, 0),
                                 ctdb_check_health, ctdb);
                return;
        }

        ret = ctdb_event_script_callback(ctdb,
                                         ctdb->monitor->monitor_context,
                                         ctdb_health_callback,
                                         ctdb, CTDB_EVENT_MONITOR, "%s", "");
        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n"));
                ctdb->monitor->next_interval = 5;
                tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                                 timeval_current_ofs(5, 0),
                                 ctdb_check_health, ctdb);
        }
}

/*
  (Temporarily) disabling monitoring stops the monitor event scripts from
  running, but node health checks still occur.
*/
void ctdb_disable_monitoring(struct ctdb_context *ctdb)
{
        ctdb->monitor->monitoring_mode = CTDB_MONITORING_DISABLED;
        DEBUG(DEBUG_INFO,("Monitoring has been disabled\n"));
}

/*
   Re-enable running monitor events after they have been disabled
 */
void ctdb_enable_monitoring(struct ctdb_context *ctdb)
{
        ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
        ctdb->monitor->next_interval = 5;
        DEBUG(DEBUG_INFO,("Monitoring has been enabled\n"));
}

/* stop any monitoring
   this should only be done when shutting down the daemon
*/
void ctdb_stop_monitoring(struct ctdb_context *ctdb)
{
        talloc_free(ctdb->monitor->monitor_context);
        ctdb->monitor->monitor_context = NULL;

        ctdb->monitor->monitoring_mode = CTDB_MONITORING_DISABLED;
        ctdb->monitor->next_interval = 5;
        DEBUG(DEBUG_NOTICE,("Monitoring has been stopped\n"));
}

/*
  wait for the initial recoveries to finish before starting to monitor
  node health
 */
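/*
  This is the entry point for the monitoring state machine: it puts the
  node into the FIRST_RECOVERY runstate and arms a timer that polls
  ctdb_wait_until_recovered() once per second until recovery has settled,
  after which ctdb_run_startup() runs the "startup" event and
  ctdb_startup_callback() enables the periodic ctdb_check_health() loop.
*/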
void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb)
{
        ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);

        ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
        CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);

        ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
        CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);

        tevent_add_timer(ctdb->ev, ctdb->monitor->monitor_context,
                         timeval_current_ofs(1, 0),
                         ctdb_wait_until_recovered, ctdb);
}


/*
  modify flags on a node
 */
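/*
  Note on the flag handling below: the new flags are taken from the
  request, except that the DISCONNECTED flag always keeps its local value,
  and for the local node the STOPPED and BANNED flags are preserved as
  well, since remote nodes are not allowed to change them.
*/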
int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
{
        struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)indata.dptr;
        struct ctdb_node *node;
        uint32_t old_flags;

        if (c->pnn >= ctdb->num_nodes) {
                DEBUG(DEBUG_ERR,(__location__ " Node %d is invalid, num_nodes :%d\n", c->pnn, ctdb->num_nodes));
                return -1;
        }

        node         = ctdb->nodes[c->pnn];
        old_flags    = node->flags;
        if (c->pnn != ctdb->pnn) {
                c->old_flags  = node->flags;
        }
        node->flags   = c->new_flags & ~NODE_FLAGS_DISCONNECTED;
        node->flags  |= (c->old_flags & NODE_FLAGS_DISCONNECTED);

        /* we don't let other nodes modify our STOPPED status */
        if (c->pnn == ctdb->pnn) {
                node->flags &= ~NODE_FLAGS_STOPPED;
                if (old_flags & NODE_FLAGS_STOPPED) {
                        node->flags |= NODE_FLAGS_STOPPED;
                }
        }

        /* we don't let other nodes modify our BANNED status */
        if (c->pnn == ctdb->pnn) {
                node->flags &= ~NODE_FLAGS_BANNED;
                if (old_flags & NODE_FLAGS_BANNED) {
                        node->flags |= NODE_FLAGS_BANNED;
                }
        }

        if (node->flags == c->old_flags) {
                DEBUG(DEBUG_INFO, ("Control modflags on node %u - Unchanged - flags 0x%x\n", c->pnn, node->flags));
                return 0;
        }

        DEBUG(DEBUG_INFO, ("Control modflags on node %u - flags now 0x%x\n", c->pnn, node->flags));

        if (node->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) {
                DEBUG(DEBUG_ERR, (__location__ " Node %u became healthy - force recovery for startup\n",
                                  c->pnn));
                ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
        }

        /* tell the recovery daemon something has changed */
        c->new_flags = node->flags;
        ctdb_daemon_send_message(ctdb, ctdb->pnn,
                                 CTDB_SRVID_SET_NODE_FLAGS, indata);

        /* if we have become banned, we should go into recovery mode */
        if ((node->flags & NODE_FLAGS_BANNED) && !(c->old_flags & NODE_FLAGS_BANNED) && (node->pnn == ctdb->pnn)) {
                ctdb_local_node_got_banned(ctdb);
        }

        return 0;
}

/*
  return the monitoring mode
 */
int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb)
{
        if (ctdb->monitor == NULL) {
                return CTDB_MONITORING_DISABLED;
        }
        return ctdb->monitor->monitoring_mode;
}

/*
 * Check if monitoring has been stopped
 */
bool ctdb_stopped_monitoring(struct ctdb_context *ctdb)
{
        return (ctdb->monitor->monitor_context == NULL ? true : false);
}