ctdbd: Backport use of external script to debug hung eventscript
author Martin Schwenke <martin@meltin.net>
Thu, 17 May 2012 00:17:51 +0000 (10:17 +1000)
committer Martin Schwenke <martin@meltin.net>
Mon, 6 May 2013 23:36:30 +0000 (09:36 +1000)
This is a cherry-pick from 6e68797af67bee36f2bad045f94806e7e98f27e9,
combined with several recent fixes:

  8507303b525d20c74e8ec4e7c4f5f275945cd3b6
    scripts: debug-hung-script.sh doesn't need functions/loadconfig
  501461cc3e132d4adee9e91b5d4513a26bae2846
    ctdbd: Remove debug_hung_script_ctx
  0581f9a84e58764d194f4e04064c2c5b393c348b
    ctdbd: Remove command-line option --debug-hung-script
  3400b2ed34b6eb9496eb55f1aab6f89d2952060d
    ctdbd: Complain loudly if CTDB_DEBUG_HUNG_SCRIPT script isn't executable
  9b0d56b16775aa16f33bdfdf831256e085fa3339
    ctdbd: Don't use a fixed length buffer for the hung script command

Signed-off-by: Martin Schwenke <martin@meltin.net>
Cherry-pick-from: b86270fae7fd9f8a7a718e15d8c7436a918f28c4

Makefile.in
config/ctdb.init
config/ctdb.sysconfig
config/debug-hung-script.sh [new file with mode: 0644]
packaging/RPM/ctdb.spec.in
server/eventscript.c

diff --git a/Makefile.in b/Makefile.in
index 5fa9e98c10df0471fc56f5d74a597f2720a94da0..9f55278a245e7bbac3e81a76457d455ccbc08ac2 100755 (executable)
@@ -272,6 +272,7 @@ install: all
        if [ -f doc/onnode.1 ];then ${INSTALLCMD} -m 644 doc/onnode.1 $(DESTDIR)$(mandir)/man1; fi
        if [ -f doc/ltdbtool.1 ]; then ${INSTALLCMD} -m 644 doc/ltdbtool.1 $(DESTDIR)$(mandir)/man1; fi
        if [ ! -f $(DESTDIR)$(etcdir)/ctdb/notify.sh ];then ${INSTALLCMD} -m 755 config/notify.sh $(DESTDIR)$(etcdir)/ctdb; fi
+       if [ ! -f $(DESTDIR)$(etcdir)/ctdb/debug-hung-script.sh ];then ${INSTALLCMD} -m 755 config/debug-hung-script.sh $(DESTDIR)$(etcdir)/ctdb; fi
        if [ ! -f $(DESTDIR)$(etcdir)/ctdb/ctdb-crash-cleanup.sh ];then ${INSTALLCMD} -m 755 config/ctdb-crash-cleanup.sh $(DESTDIR)$(etcdir)/ctdb; fi
 
 test: all
diff --git a/config/ctdb.init b/config/ctdb.init
index d6493bd7db0d24072899878433f3eb2783df71ba..2b9902b3548703f505d8d722f06a9b135540d6c5 100755 (executable)
@@ -111,6 +111,11 @@ build_ctdb_options () {
     maybe_set "--max-persistent-check-errors" "$CTDB_MAX_PERSISTENT_CHECK_ERRORS"
 }
 
+export_debug_variables ()
+{
+    export CTDB_DEBUG_HUNG_SCRIPT
+}
+
 check_tdb () {
        local PDBASE=$1
 
@@ -239,6 +244,8 @@ start() {
 
     build_ctdb_options
 
+    export_debug_variables
+
     # make sure we drop any ips that might still be held if previous
     # instance of ctdb got killed with -9 or similar
     drop_all_public_ips
diff --git a/config/ctdb.sysconfig b/config/ctdb.sysconfig
index 1f2edc4c9df2cfb5c99d2b14d124ec9229e16176..08a550f4c071a2996368a3f9d871b4ba178b1d15 100644 (file)
@@ -92,6 +92,9 @@ CTDB_RECOVERY_LOCK="/some/place/on/shared/storage"
 # a script to run when node health changes
 # CTDB_NOTIFY_SCRIPT=/etc/ctdb/notify.sh
 
+# a script to collect data when an eventscript has hung
+# CTDB_DEBUG_HUNG_SCRIPT=/etc/ctdb/debug-hung-script.sh
+
 # the directory to put the local ctdb database files in
 # defaults to /var/ctdb
 # CTDB_DBDIR=/var/ctdb
diff --git a/config/debug-hung-script.sh b/config/debug-hung-script.sh
new file mode 100644 (file)
index 0000000..dcf68ba
--- /dev/null
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+echo "Pstree output for the hung script:"
+pstree -p -a $1
diff --git a/packaging/RPM/ctdb.spec.in b/packaging/RPM/ctdb.spec.in
index 1d9781d3039146fd1f0998757600c97eedc6cd63..848a3692a81d35a1f57e53fd847fe20781110e10 100644 (file)
@@ -88,6 +88,7 @@ rm -rf $RPM_BUILD_ROOT
 
 %config(noreplace) %{_sysconfdir}/sysconfig/ctdb
 %config(noreplace) %{_sysconfdir}/ctdb/notify.sh
+%config(noreplace) %{_sysconfdir}/ctdb/debug-hung-script.sh
 %config(noreplace) %{_sysconfdir}/ctdb/ctdb-crash-cleanup.sh
 %config(noreplace) %{_sysconfdir}/ctdb/functions
 %attr(755,root,root) %{initdir}/ctdb
diff --git a/server/eventscript.c b/server/eventscript.c
index a1bcf0150d2b0052f90b5134900feb62645a3f44..05fb37dd6b6c7f61598ed71f0dbb154497dd99bb 100644 (file)
@@ -504,15 +504,14 @@ static void ctdb_event_script_handler(struct event_context *ev, struct fd_event
        }
 }
 
-static void debug_timeout(struct ctdb_event_script_state *state)
+static void ctdb_run_debug_hung_script(struct ctdb_context *ctdb, struct ctdb_event_script_state *state)
 {
        struct ctdb_script_wire *current = get_current_script(state);
        char *cmd;
        pid_t pid;
-       time_t t;
-       char tbuf[100], buf[200];
+       const char * debug_hung_script = ETCDIR "/ctdb/debug-hung-script.sh";
 
-       cmd = child_command_string(state->ctdb, state,
+       cmd = child_command_string(ctdb, state,
                                   state->from_user, current->name,
                                   state->call, state->options);
        CTDB_NO_MEMORY_VOID(state->ctdb, cmd);
@@ -521,26 +520,36 @@ static void debug_timeout(struct ctdb_event_script_state *state)
                         cmd, timeval_elapsed(&current->start), state->child));
        talloc_free(cmd);
 
-       t = time(NULL);
-       strftime(tbuf, sizeof(tbuf)-1, "%Y%m%d%H%M%S",  localtime(&t));
-       sprintf(buf, "{ pstree -p; cat /proc/locks; ls -li /var/ctdb/ /var/ctdb/persistent; }"
-                       " >/tmp/ctdb.event.%s.%d", tbuf, getpid());
-
-       pid = ctdb_fork(state->ctdb);
-       if (pid == 0) {
-               system(buf);
-               /* Now we can kill the child */
+       if (!ctdb_fork_with_logging(ctdb, ctdb, NULL, NULL, &pid)) {
+               DEBUG(DEBUG_ERR,("Failed to fork a child process with logging to track hung event script\n"));
                kill(state->child, SIGTERM);
-               exit(0);
+               return;
        }
        if (pid == -1) {
                DEBUG(DEBUG_ERR,("Fork for debug script failed : %s\n",
                                 strerror(errno)));
-       } else {
-               DEBUG(DEBUG_ERR,("Logged timedout eventscript : %s\n", buf));
-               /* Don't kill child until timeout done. */
-               state->child = 0;
+               kill(state->child, SIGTERM);
+               return;
        }
+       if (pid == 0) {
+               char *buf;
+
+               if (getenv("CTDB_DEBUG_HUNG_SCRIPT") != NULL) {
+                       debug_hung_script = getenv("CTDB_DEBUG_HUNG_SCRIPT");
+               }
+
+               buf = talloc_asprintf(NULL, "%s %d",
+                                     debug_hung_script, state->child);
+               system(buf);
+               talloc_free(buf);
+
+               /* Now we can kill the child */
+               kill(state->child, SIGTERM);
+               _exit(0);
+       }
+
+       /* Don't kill child until timeout done. */
+       state->child = 0;
 }
 
 /* called when child times out */
@@ -564,10 +573,11 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
        case CTDB_EVENT_STATUS:
                state->scripts->scripts[state->current].status = 0;
                DEBUG(DEBUG_ERR,("Ignoring hung script for %s call %d\n", state->options, state->call));
+               ctdb_run_debug_hung_script(ctdb, state);
                break;
         default:
                state->scripts->scripts[state->current].status = -ETIME;
-               debug_timeout(state);
+               ctdb_run_debug_hung_script(ctdb, state);
        }
 
        talloc_free(state);