ctdb/pmda: fix num_recoveries metric store
[metze/samba/wip.git] / ctdb / utils / pmda / pmda_ctdb.c
1 /*
2  * CTDB Performance Metrics Domain Agent (PMDA) for Performance Co-Pilot (PCP)
3  *
4  * Copyright (c) 1995,2004 Silicon Graphics, Inc.  All Rights Reserved.
5  * Copyright (c) 2011 David Disseldorp
6  *
7  * This program is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU General Public License as published by the
9  * Free Software Foundation; either version 2 of the License, or (at your
10  * option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14  * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * for more details.
16  *
17  * You should have received a copy of the GNU General Public License along
18  * with this program; if not, write to the Free Software Foundation, Inc.,
19  * 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
20  */
21
22 #include "replace.h"
23 #include "system/network.h"
24
25 #include <talloc.h>
26 #include <tevent.h>
27 #include <tdb.h>
28
29 #include "lib/util/time.h"
30 #include "lib/util/blocking.h"
31
32 #include "client/client.h"
33 #include "client/client_sync.h"
34
35 #include <pcp/pmapi.h>
36 #include <pcp/pmda.h>
37
/*
 * Compatibility shims, compiled only when the build defines
 * HAVE___PMID_INT: the accessor names used in this file are mapped onto
 * the members of a __pmID_int pointer, pmProgname and __pmSetProgname().
 * Macro arguments and expansions are parenthesized so that any pointer
 * expression can be passed safely.
 */
#ifdef HAVE___PMID_INT
#include <pcp/impl.h>

#define pmID_cluster(id)        ((id)->cluster)
#define pmID_item(id)           ((id)->item)
#define pmGetProgname()         pmProgname
#define pmSetProgname(a)        __pmSetProgname(a)
#endif
46
47 #include "domain.h"
48
49 /*
50  * CTDB PMDA
51  *
52  * This PMDA connects to the locally running ctdbd daemon and pulls
53  * statistics for export via PCP. The ctdbd Unix domain socket path can be
54  * specified with the CTDB_SOCKET environment variable, otherwise the default
55  * path is used.
56  */
57
58 /*
59  * All metrics supported in this PMDA - one table entry for each.
60  * The 4th field specifies the serial number of the instance domain
61  * for the metric, and must be either PM_INDOM_NULL (denoting a
62  * metric that only ever has a single value), or the serial number
63  * of one of the instance domains declared in the instance domain table
64  * (i.e. in indomtab, above).
65  */
66 static pmdaMetric metrictab[] = {
67         /* num_clients */
68         { NULL, { PMDA_PMID(0,0), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
69                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
70         /* frozen */
71         { NULL, { PMDA_PMID(0,1), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
72                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
73         /* recovering */
74         { NULL, { PMDA_PMID(0,2), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
75                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
76         /* client_packets_sent */
77         { NULL, { PMDA_PMID(0,3), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
78                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
79         /* client_packets_recv */
80         { NULL, { PMDA_PMID(0,4), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
81                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
82         /* node_packets_sent */
83         { NULL, { PMDA_PMID(0,5), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
84                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
85         /* node_packets_recv */
86         { NULL, { PMDA_PMID(0,6), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
87                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
88         /* keepalive_packets_sent */
89         { NULL, { PMDA_PMID(0,7), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
90                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
91         /* keepalive_packets_recv */
92         { NULL, { PMDA_PMID(0,8), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
93                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
94         /* req_call */
95         { NULL, { PMDA_PMID(1,0), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
96                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
97         /* reply_call */
98         { NULL, { PMDA_PMID(1,1), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
99                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
100         /* req_dmaster */
101         { NULL, { PMDA_PMID(1,2), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
102                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
103         /* reply_dmaster */
104         { NULL, { PMDA_PMID(1,3), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
105                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
106         /* reply_error */
107         { NULL, { PMDA_PMID(1,4), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
108                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
109         /* req_message */
110         { NULL, { PMDA_PMID(1,5), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
111                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
112         /* req_control */
113         { NULL, { PMDA_PMID(1,6), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
114                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
115         /* reply_control */
116         { NULL, { PMDA_PMID(1,7), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
117                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
118         /* req_call */
119         { NULL, { PMDA_PMID(2,0), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
120                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
121         /* req_message */
122         { NULL, { PMDA_PMID(2,1), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
123                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
124         /* req_control */
125         { NULL, { PMDA_PMID(2,2), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
126                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
127         /* call */
128         { NULL, { PMDA_PMID(3,0), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
129                 PMDA_PMUNITS(0,0,1,0,0,0) }, },
130         /* control */
131         { NULL, { PMDA_PMID(3,1), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
132                 PMDA_PMUNITS(0,0,1,0,0,0) }, },
133         /* traverse */
134         { NULL, { PMDA_PMID(3,2), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
135                 PMDA_PMUNITS(0,0,1,0,0,0) }, },
136         /* total_calls */
137         { NULL, { PMDA_PMID(0,9), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
138                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
139         /* pending_calls */
140         { NULL, { PMDA_PMID(0,10), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
141                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
142         /* locks.num_calls */
143         { NULL, { PMDA_PMID(0,11), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
144                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
145         /* locks.num_pending */
146         { NULL, { PMDA_PMID(0,12), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
147                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
148         /* childwrite_calls */
149         { NULL, { PMDA_PMID(0,13), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
150                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
151         /* pending_childwrite_calls */
152         { NULL, { PMDA_PMID(0,14), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
153                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
154         /* memory_used */
155         { NULL, { PMDA_PMID(0,15), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
156                 PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0) }, },
157         /* max_hop_count */
158         { NULL, { PMDA_PMID(0,16), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
159                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
160         /* reclock.ctdbd.max */
161         { NULL, { PMDA_PMID(0,17), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
162                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
163         /* reclock.recd.max */
164         { NULL, { PMDA_PMID(0,18), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
165                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
166         /* call_latency.max */
167         { NULL, { PMDA_PMID(0,19), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
168                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
169         /* locks.latency.max */
170         { NULL, { PMDA_PMID(0,20), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
171                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
172         /* childwrite_latency.max */
173         { NULL, { PMDA_PMID(0,21), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
174                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
175         /* num_recoveries */
176         { NULL, { PMDA_PMID(0,22), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
177                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
178 };
179
/* Event context created in pmda_ctdb_daemon_connect(). */
static struct tevent_context *ev = NULL;
/* Connection to ctdbd; NULL means "not connected, reconnect on next fetch". */
static struct ctdb_client_context *client = NULL;
/* Statistics snapshot filled in by pmda_ctdb_fetch() for the fill_*() helpers. */
static struct ctdb_statistics *stats = NULL;
183
184 static void
185 pmda_ctdb_disconnected(void *args)
186 {
187         fprintf(stderr, "ctdbd unreachable\n");
188         TALLOC_FREE(client);
189 }
190
191
192 static int
193 pmda_ctdb_daemon_connect(void)
194 {
195         const char *socket_name;
196         int ret;
197
198         ev = tevent_context_init(NULL);
199         if (ev == NULL) {
200                 fprintf(stderr, "Failed to init event ctx\n");
201                 return -1;
202         }
203
204         socket_name = getenv("CTDB_SOCKET");
205         if (socket_name == NULL) {
206                 socket_name = CTDB_SOCKET;
207         }
208
209         ret = ctdb_client_init(ev, ev, socket_name, &client);
210         if (ret != 0) {
211                 fprintf(stderr, "Failed to connect to ctdb daemon via %s\n",
212                         socket_name);
213                 goto err_ev;
214         }
215
216         ctdb_client_set_disconnect_callback(client, pmda_ctdb_disconnected,
217                                             NULL);
218
219         return 0;
220
221 err_ev:
222         talloc_free(ev);
223         client = NULL;
224         return -1;
225 }
226
227 static void
228 pmda_ctdb_daemon_disconnect(void)
229 {
230         TALLOC_FREE(client);
231         talloc_free(ev);
232 }
233
234 static int
235 fill_base(unsigned int item, pmAtomValue *atom)
236 {
237         switch (item) {
238         case 0:
239                 atom->ul = stats->num_clients;
240                 break;
241         case 1:
242                 atom->ul = stats->frozen;
243                 break;
244         case 2:
245                 atom->ul = stats->recovering;
246                 break;
247         case 3:
248                 atom->ul = stats->client_packets_sent;
249                 break;
250         case 4:
251                 atom->ul = stats->client_packets_recv;
252                 break;
253         case 5:
254                 atom->ul = stats->node_packets_sent;
255                 break;
256         case 6:
257                 atom->ul = stats->node_packets_recv;
258                 break;
259         case 7:
260                 atom->ul = stats->keepalive_packets_sent;
261                 break;
262         case 8:
263                 atom->ul = stats->keepalive_packets_recv;
264                 break;
265         case 9:
266                 atom->ul = stats->total_calls;
267                 break;
268         case 10:
269                 atom->ul = stats->pending_calls;
270                 break;
271         case 11:
272                 atom->ul = stats->locks.num_calls;
273                 break;
274         case 12:
275                 atom->ul = stats->locks.num_pending;
276                 break;
277         case 13:
278                 atom->ul = stats->childwrite_calls;
279                 break;
280         case 14:
281                 atom->ul = stats->pending_childwrite_calls;
282                 break;
283         case 15:
284                 atom->ul = stats->memory_used;
285                 break;
286         case 16:
287                 atom->ul = stats->max_hop_count;
288                 break;
289         case 17:
290                 atom->d = stats->reclock.ctdbd.max;
291                 break;
292         case 18:
293                 atom->d = stats->reclock.recd.max;
294                 break;
295         case 19:
296                 atom->d = stats->call_latency.max;
297                 break;
298         case 20:
299                 atom->d = stats->locks.latency.max;
300                 break;
301         case 21:
302                 atom->d = stats->childwrite_latency.max;
303                 break;
304         case 22:
305                 atom->ul = stats->num_recoveries;
306                 break;
307         default:
308                 return PM_ERR_PMID;
309         }
310
311         return 0;
312 }
313
314 static int
315 fill_node(unsigned int item, pmAtomValue *atom)
316 {
317         switch (item) {
318         case 0:
319                atom->ul = stats->node.req_call;
320                break;
321         case 1:
322                atom->ul = stats->node.reply_call;
323                break;
324         case 2:
325                atom->ul = stats->node.req_dmaster;
326                break;
327         case 3:
328                atom->ul = stats->node.reply_dmaster;
329                break;
330         case 4:
331                atom->ul = stats->node.reply_error;
332                break;
333         case 5:
334                atom->ul = stats->node.req_message;
335                break;
336         case 6:
337                atom->ul = stats->node.req_control;
338                break;
339         case 7:
340                 atom->ul = stats->node.reply_control;
341                 break;
342         default:
343                 return PM_ERR_PMID;
344         }
345
346         return 0;
347 }
348
349
350 static int
351 fill_client(unsigned int item, pmAtomValue *atom)
352 {
353         switch (item) {
354         case 0:
355                 atom->ul = stats->client.req_call;
356                 break;
357         case 1:
358                 atom->ul = stats->client.req_message;
359                 break;
360         case 2:
361                 atom->ul = stats->client.req_control;
362                 break;
363         default:
364                 return PM_ERR_PMID;
365         }
366
367         return 0;
368 }
369
370 static int
371 fill_timeout(unsigned int item, pmAtomValue *atom)
372 {
373         switch (item) {
374         case 0:
375                 atom->ul = stats->timeouts.call;
376                 break;
377         case 1:
378                 atom->ul = stats->timeouts.control;
379                 break;
380         case 2:
381                 atom->ul = stats->timeouts.traverse;
382                 break;
383         default:
384                 return PM_ERR_PMID;
385         }
386
387         return 0;
388 }
389
390 /*
391  * callback provided to pmdaFetch
392  */
393 static int
394 pmda_ctdb_fetch_cb(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
395 {
396         int ret;
397 #ifdef HAVE___PMID_INT
398         __pmID_int *id = (__pmID_int *)&(mdesc->m_desc.pmid);
399 #else
400         pmID id = *(pmID *)&(mdesc->m_desc.pmid);
401 #endif
402
403         if (inst != PM_IN_NULL) {
404                 return PM_ERR_INST;
405         }
406
407         if (stats == NULL) {
408                 fprintf(stderr, "stats not available\n");
409                 ret = PM_ERR_VALUE;
410                 goto err_out;
411         }
412
413
414         switch (pmID_cluster(id)) {
415         case 0:
416                 ret = fill_base(pmID_item(id), atom);
417                 if (ret) {
418                         goto err_out;
419                 }
420                 break;
421         case 1:
422                 ret = fill_node(pmID_item(id), atom);
423                 if (ret) {
424                         goto err_out;
425                 }
426                 break;
427         case 2:
428                 ret = fill_client(pmID_item(id), atom);
429                 if (ret) {
430                         goto err_out;
431                 }
432                 break;
433         case 3:
434                 ret = fill_timeout(pmID_item(id), atom);
435                 if (ret) {
436                         goto err_out;
437                 }
438                 break;
439         default:
440                 return PM_ERR_PMID;
441         }
442
443         ret = 0;
444 err_out:
445         return ret;
446 }
447
448 /*
449  * This routine is called once for each pmFetch(3) operation, so is a
450  * good place to do once-per-fetch functions, such as value caching or
451  * instance domain evaluation.
452  */
453 static int
454 pmda_ctdb_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda)
455 {
456         int ret;
457
458         if (client == NULL) {
459                 fprintf(stderr, "attempting reconnect to ctdbd\n");
460                 ret = pmda_ctdb_daemon_connect();
461                 if (ret < 0) {
462                         fprintf(stderr, "reconnect failed\n");
463                         return PM_ERR_VALUE;
464                 }
465         }
466
467         ret = ctdb_ctrl_statistics(client, ev, client, CTDB_CURRENT_NODE,
468                                    tevent_timeval_current_ofs(1,0), &stats);
469         if (ret != 0) {
470                 fprintf(stderr, "ctdb control for statistics failed, reconnecting\n");
471                 pmda_ctdb_daemon_disconnect();
472                 ret = PM_ERR_VALUE;
473                 goto err_out;
474         }
475
476         ret = pmdaFetch(numpmid, pmidlist, resp, pmda);
477
478         talloc_free(stats);
479 err_out:
480         return ret;
481 }
482
483 void pmda_ctdb_init(pmdaInterface *dp);
484
485 /*
486  * Initialise the agent
487  */
488 void
489 pmda_ctdb_init(pmdaInterface *dp)
490 {
491         if (dp->status != 0) {
492                 return;
493         }
494
495         dp->version.two.fetch = pmda_ctdb_fetch;
496         pmdaSetFetchCallBack(dp, pmda_ctdb_fetch_cb);
497
498         pmdaInit(dp, NULL, 0, metrictab,
499                  (sizeof(metrictab) / sizeof(metrictab[0])));
500 }
501
502 static char *
503 helpfile(void)
504 {
505         static char buf[MAXPATHLEN];
506
507         if (!buf[0]) {
508                 snprintf(buf, sizeof(buf), "%s/ctdb/help",
509                          pmGetConfig("PCP_PMDAS_DIR"));
510         }
511         return buf;
512 }
513
/*
 * Print the command line synopsis to stderr and terminate the process
 * with exit status 1.
 */
static void
usage(void)
{
        static const char options[] =
          "Options:\n"
          "  -d domain        use domain (numeric) for metrics domain of PMDA\n"
          "  -l logfile       write log into logfile rather than using default log name\n"
          "\nExactly one of the following options may appear:\n"
          "  -i port          expect PMCD to connect on given inet port (number or name)\n"
          "  -p               expect PMCD to supply stdin/stdout (pipe)\n"
          "  -u socket        expect PMCD to connect on given unix domain socket\n";

        fprintf(stderr, "Usage: %s [options]\n\n", pmGetProgname());
        fputs(options, stderr);
        exit(1);
}
528
529 /*
530  * Set up the agent if running as a daemon.
531  */
532 int
533 main(int argc, char **argv)
534 {
535         int err = 0;
536         char log_file[] = "pmda_ctdb.log";
537         pmdaInterface dispatch;
538
539         pmSetProgname(argv[0]);
540
541         pmdaDaemon(&dispatch, PMDA_INTERFACE_2, argv[0], CTDB,
542                    log_file, helpfile());
543
544         if (pmdaGetOpt(argc, argv, "d:i:l:pu:?", &dispatch, &err) != EOF) {
545                 err++;
546         }
547
548         if (err) {
549                 usage();
550         }
551
552         pmdaOpenLog(&dispatch);
553         pmda_ctdb_init(&dispatch);
554         pmdaConnect(&dispatch);
555         pmdaMain(&dispatch);
556
557         exit(0);
558 }
559