From dc2bf2f7d27913f29f5b6c92e41c6b59fa31d163 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 6 Sep 2012 20:22:38 +1000 Subject: [PATCH] common: Debug ctdb_addr_to_str() using new function ctdb_external_trace() We've seen this function report "Unknown family, 0" and then CTDB disappeared without a trace. If we can reproduce it then this might help us to debug it. The idea is that you do something like the following in /etc/sysconfig/ctdb: export CTDB_EXTERNAL_TRACE="/etc/ctdb/gcore_trace.sh" When we hit this error then we call out to gcore to get a core file so we can do forensics. This might block CTDB for a few seconds. Signed-off-by: Martin Schwenke --- Makefile.in | 1 + common/ctdb_util.c | 25 +++++++++++++++++++++++++ config/gcore_trace.sh | 3 +++ include/ctdb_private.h | 1 + packaging/RPM/ctdb.spec.in | 1 + 5 files changed, 31 insertions(+) create mode 100755 config/gcore_trace.sh diff --git a/Makefile.in b/Makefile.in index c97a9caf..e3d6d5bf 100755 --- a/Makefile.in +++ b/Makefile.in @@ -291,6 +291,7 @@ install: all if [ -f doc/ltdbtool.1 ]; then ${INSTALLCMD} -m 644 doc/ltdbtool.1 $(DESTDIR)$(mandir)/man1; fi if [ ! -f $(DESTDIR)$(etcdir)/ctdb/notify.sh ];then ${INSTALLCMD} -m 755 config/notify.sh $(DESTDIR)$(etcdir)/ctdb; fi if [ ! -f $(DESTDIR)$(etcdir)/ctdb/ctdb-crash-cleanup.sh ];then ${INSTALLCMD} -m 755 config/ctdb-crash-cleanup.sh $(DESTDIR)$(etcdir)/ctdb; fi + if [ ! -f $(DESTDIR)$(etcdir)/ctdb/gcore_trace.sh ];then ${INSTALLCMD} -m 755 config/gcore_trace.sh $(DESTDIR)$(etcdir)/ctdb; fi test: all tests/run_tests.sh diff --git a/common/ctdb_util.c b/common/ctdb_util.c index 061c16d2..bb212f58 100644 --- a/common/ctdb_util.c +++ b/common/ctdb_util.c @@ -60,6 +60,30 @@ void ctdb_fatal(struct ctdb_context *ctdb, const char *msg) abort(); } +/* Invoke an external program to do some sort of tracing on the CTDB + * process. This might block for a little while. 
The external + * program is specified by the environment variable + * CTDB_EXTERNAL_TRACE. This program should take one argument: the + * pid of the process to trace. Commonly, the program would be a + * wrapper script around gcore. + */ +void ctdb_external_trace(void) +{ + + const char * t = getenv("CTDB_EXTERNAL_TRACE"); + char * cmd; + + if (t == NULL) { + return; + } + + cmd = talloc_asprintf(NULL, "%s %lu", t, (unsigned long) getpid()); + DEBUG(DEBUG_WARNING,("begin external trace: %s\n", cmd)); + system(cmd); + DEBUG(DEBUG_WARNING,("end external trace: %s\n", cmd)); + talloc_free(cmd); +} + /* parse a IP:port pair */ @@ -574,6 +598,7 @@ char *ctdb_addr_to_str(ctdb_sock_addr *addr) break; default: DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family %u\n", addr->sa.sa_family)); + ctdb_external_trace(); } return cip; diff --git a/config/gcore_trace.sh b/config/gcore_trace.sh new file mode 100755 index 00000000..4d3e1d1d --- /dev/null +++ b/config/gcore_trace.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +gcore -o "/var/log/core" "$1" 2>&1 | logger -t "ctdb:gcore_trace" diff --git a/include/ctdb_private.h b/include/ctdb_private.h index 3a5d3cf6..b5bd45c1 100644 --- a/include/ctdb_private.h +++ b/include/ctdb_private.h @@ -655,6 +655,7 @@ struct ctdb_fetch_handle { /* internal prototypes */ void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...) 
PRINTF_ATTRIBUTE(2,3); void ctdb_fatal(struct ctdb_context *ctdb, const char *msg); +void ctdb_external_trace(void); bool ctdb_same_address(struct ctdb_address *a1, struct ctdb_address *a2); int ctdb_parse_address(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, const char *str, diff --git a/packaging/RPM/ctdb.spec.in b/packaging/RPM/ctdb.spec.in index 2d744b20..cb58a5ba 100644 --- a/packaging/RPM/ctdb.spec.in +++ b/packaging/RPM/ctdb.spec.in @@ -89,6 +89,7 @@ rm -rf $RPM_BUILD_ROOT %config(noreplace) %{_sysconfdir}/sysconfig/ctdb %config(noreplace) %{_sysconfdir}/ctdb/notify.sh %config(noreplace) %{_sysconfdir}/ctdb/ctdb-crash-cleanup.sh +%config(noreplace) %{_sysconfdir}/ctdb/gcore_trace.sh %config(noreplace) %{_sysconfdir}/ctdb/functions %attr(755,root,root) %{initdir}/ctdb -- 2.34.1