Merge remote branch 'ddiss/master_pmda_namespace'
[sahlberg/ctdb.git] / libctdb / ctdb.c
1 /*
2    core of libctdb
3
4    Copyright (C) Rusty Russell 2010
5    Copyright (C) Ronnie Sahlberg 2011
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include <ctdb.h>
21 #include <poll.h>
22 #include <errno.h>
23 #include <unistd.h>
24 #include <fcntl.h>
25 #include <stdlib.h>
26 #include <sys/socket.h>
27 #include <sys/un.h>
28 #include <sys/ioctl.h>
29 #include "libctdb_private.h"
30 #include "io_elem.h"
31 #include "local_tdb.h"
32 #include "messages.h"
33 #include <dlinklist.h>
34 #include <ctdb_protocol.h>
35
/*
 * Remove type-safety macros.
 * NOTE(review): presumably ctdb.h wraps these entry points in type-checking
 * macros; undefining them here lets this file define the real functions
 * under the same names -- confirm against ctdb.h.
 */
#undef ctdb_attachdb_send
#undef ctdb_readrecordlock_async
#undef ctdb_readonlyrecordlock_async
#undef ctdb_connect
41
/*
 * A record lock requested through ctdb_readrecordlock_internal().
 * The key bytes are stored in the same allocation, directly after the
 * structure (see ctdb_readrecordlock_internal).
 */
struct ctdb_lock {
	/* Linkage in the connection's list of held locks (ctdb->locks). */
	struct ctdb_lock *next, *prev;

	/* Database this lock belongs to. */
	struct ctdb_db *ctdb_db;
	/* Private copy of the record key being locked. */
	TDB_DATA key;

	/* Is this a request for read-only lock ? */
	bool readonly;

	/* This will always be set by the time user sees this.
	 * It is lock_magic(lock) while the lock is held and 0 otherwise. */
	unsigned long held_magic;
	/* Record header returned by ctdb_local_fetch(); freed by free_lock(). */
	struct ctdb_ltdb_header *hdr;

	/* For convenience, we stash original callback here. */
	ctdb_rrl_callback_t callback;
};
58
/* An attached database, built up by ctdb_attachdb_send/ctdb_attachdb_recv. */
struct ctdb_db {
	/* Connection the database was attached through. */
	struct ctdb_connection *ctdb;
	/* True if attached with CTDB_CONTROL_DB_ATTACH_PERSISTENT. */
	bool persistent;
	/* Flags supplied by the caller of ctdb_attachdb_send(). */
	uint32_t tdb_flags;
	/* Database id taken from the DB_ATTACH reply. */
	uint32_t id;
	/* Local tdb handle, opened in ctdb_attachdb_recv(). */
	struct tdb_context *tdb;

	/* User callback (and its argument) invoked when the attach finishes. */
	ctdb_callback_t callback;
	void *private_data;
};
69
70 static void remove_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
71 {
72         DLIST_REMOVE(ctdb->locks, lock);
73 }
74
75 /* FIXME: for thread safety, need tid info too. */
76 static bool holding_lock(struct ctdb_connection *ctdb)
77 {
78         /* For the moment, you can't ever hold more than 1 lock. */
79         return (ctdb->locks != NULL);
80 }
81
82 static void add_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
83 {
84         DLIST_ADD(ctdb->locks, lock);
85 }
86
87 static void cleanup_locks(struct ctdb_connection *ctdb, struct ctdb_db *db)
88 {
89         struct ctdb_lock *i, *next;
90
91         for (i = ctdb->locks; i; i = next) {
92                 /* Grab next pointer, as release_lock will free i */
93                 next = i->next;
94                 if (i->ctdb_db == db) {
95                         ctdb_release_lock(db, i);
96                 }
97         }
98 }
99
/*
 * Close @fd while leaving errno exactly as the caller had it.
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void close_noerr(int fd)
{
	const int saved_errno = errno;

	close(fd);
	errno = saved_errno;
}
107
/*
 * free() @p while leaving errno exactly as the caller had it.
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void free_noerr(void *p)
{
	const int saved_errno = errno;

	free(p);
	errno = saved_errno;
}
115
/*
 * Add O_NONBLOCK to the file status flags of @fd.
 *
 * Best effort: the function returns nothing, so failures are not reported.
 * If the current flags cannot be read we leave the descriptor alone rather
 * than feeding the -1 error value back into F_SETFL as a flag word.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void set_nonblocking(int fd)
{
	int flags = fcntl(fd, F_GETFL, 0);

	if (flags == -1)
		return;
	fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
123
/*
 * Add FD_CLOEXEC to the descriptor flags of @fd.
 *
 * Best effort: the function returns nothing, so failures are not reported.
 * If the current flags cannot be read we leave the descriptor alone rather
 * than feeding the -1 error value back into F_SETFD as a flag word.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void set_close_on_exec(int fd)
{
	int flags = fcntl(fd, F_GETFD, 0);

	if (flags == -1)
		return;
	fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
}
131
132 static void set_pnn(struct ctdb_connection *ctdb,
133                     struct ctdb_request *req,
134                     void *unused)
135 {
136         if (!ctdb_getpnn_recv(ctdb, req, &ctdb->pnn)) {
137                 DEBUG(ctdb, LOG_CRIT,
138                       "ctdb_connect(async): failed to get pnn");
139                 ctdb->broken = true;
140         }
141         ctdb_request_free(req);
142 }
143
144 struct ctdb_connection *ctdb_connect(const char *addr,
145                                      ctdb_log_fn_t log_fn, void *log_priv)
146 {
147         struct ctdb_connection *ctdb;
148         struct sockaddr_un sun;
149
150         ctdb = malloc(sizeof(*ctdb));
151         if (!ctdb) {
152                 /* With no format string, we hope it doesn't use ap! */
153                 va_list ap;
154                 memset(&ap, 0, sizeof(ap));
155                 errno = ENOMEM;
156                 log_fn(log_priv, LOG_ERR, "ctdb_connect: no memory", ap);
157                 goto fail;
158         }
159         ctdb->pnn = -1;
160         ctdb->outq = NULL;
161         ctdb->doneq = NULL;
162         ctdb->in = NULL;
163         ctdb->inqueue = NULL;
164         ctdb->message_handlers = NULL;
165         ctdb->next_id = 0;
166         ctdb->broken = false;
167         ctdb->log = log_fn;
168         ctdb->log_priv = log_priv;
169         ctdb->locks = NULL;
170
171         memset(&sun, 0, sizeof(sun));
172         sun.sun_family = AF_UNIX;
173         if (!addr)
174                 addr = CTDB_PATH;
175         strncpy(sun.sun_path, addr, sizeof(sun.sun_path)-1);
176         ctdb->fd = socket(AF_UNIX, SOCK_STREAM, 0);
177         if (ctdb->fd < 0)
178                 goto free_fail;
179
180         set_nonblocking(ctdb->fd);
181         set_close_on_exec(ctdb->fd);
182
183         if (connect(ctdb->fd, (struct sockaddr *)&sun, sizeof(sun)) == -1)
184                 goto close_fail;
185
186         /* Immediately queue a request to get our pnn. */
187         if (!ctdb_getpnn_send(ctdb, CTDB_CURRENT_NODE, set_pnn, NULL))
188                 goto close_fail;
189
190         return ctdb;
191
192 close_fail:
193         close_noerr(ctdb->fd);
194 free_fail:
195         free_noerr(ctdb);
196 fail:
197         return NULL;
198 }
199
200 void ctdb_disconnect(struct ctdb_connection *ctdb)
201 {
202         struct ctdb_request *i;
203
204         DEBUG(ctdb, LOG_DEBUG, "ctdb_disconnect");
205
206         while ((i = ctdb->outq) != NULL) {
207                 DLIST_REMOVE(ctdb->outq, i);
208                 ctdb_request_free(i);
209         }
210
211         while ((i = ctdb->doneq) != NULL) {
212                 DLIST_REMOVE(ctdb->doneq, i);
213                 ctdb_request_free(i);
214         }
215
216         if (ctdb->in)
217                 free_io_elem(ctdb->in);
218
219         remove_message_handlers(ctdb);
220
221         close(ctdb->fd);
222         /* Just in case they try to reuse */
223         ctdb->fd = -1;
224         free(ctdb);
225 }
226
227 int ctdb_get_fd(struct ctdb_connection *ctdb)
228 {
229         return ctdb->fd;
230 }
231
232 int ctdb_which_events(struct ctdb_connection *ctdb)
233 {
234         int events = POLLIN;
235
236         if (ctdb->outq)
237                 events |= POLLOUT;
238         return events;
239 }
240
241 struct ctdb_request *new_ctdb_request(struct ctdb_connection *ctdb, size_t len,
242                                       ctdb_callback_t cb, void *cbdata)
243 {
244         struct ctdb_request *req = malloc(sizeof(*req));
245         if (!req)
246                 return NULL;
247         req->io = new_io_elem(len);
248         if (!req->io) {
249                 free(req);
250                 return NULL;
251         }
252         req->ctdb = ctdb;
253         req->hdr.hdr = io_elem_data(req->io, NULL);
254         req->reply = NULL;
255         req->callback = cb;
256         req->priv_data = cbdata;
257         req->extra = NULL;
258         req->extra_destructor = NULL;
259         return req;
260 }
261
262 void ctdb_request_free(struct ctdb_request *req)
263 {
264         struct ctdb_connection *ctdb = req->ctdb;
265
266         if (req->next || req->prev) {
267                 DEBUG(ctdb, LOG_ALERT,
268                       "ctdb_request_free: request not complete! ctdb_cancel? %p (id %u)",
269                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
270                 ctdb_cancel(ctdb, req);
271                 return;
272         }
273         if (req->extra_destructor) {
274                 req->extra_destructor(ctdb, req);
275         }
276         if (req->reply) {
277                 free_io_elem(req->reply);
278         }
279         free_io_elem(req->io);
280         free(req);
281 }
282
283 /* Sanity-checking wrapper for reply. */
284 static struct ctdb_reply_call *unpack_reply_call(struct ctdb_request *req,
285                                                  uint32_t callid)
286 {
287         size_t len;
288         struct ctdb_reply_call *inhdr = io_elem_data(req->reply, &len);
289
290         /* Library user error if this isn't a reply to a call. */
291         if (req->hdr.hdr->operation != CTDB_REQ_CALL) {
292                 errno = EINVAL;
293                 DEBUG(req->ctdb, LOG_ALERT,
294                       "This was not a ctdbd call request: operation %u",
295                       req->hdr.hdr->operation);
296                 return NULL;
297         }
298
299         if (req->hdr.call->callid != callid) {
300                 errno = EINVAL;
301                 DEBUG(req->ctdb, LOG_ALERT,
302                       "This was not a ctdbd %u call request: %u",
303                       callid, req->hdr.call->callid);
304                 return NULL;
305         }
306
307         /* ctdbd or our error if this isn't a reply call. */
308         if (len < sizeof(*inhdr) || inhdr->hdr.operation != CTDB_REPLY_CALL) {
309                 errno = EIO;
310                 DEBUG(req->ctdb, LOG_CRIT,
311                       "Invalid ctdbd call reply: len %zu, operation %u",
312                       len, inhdr->hdr.operation);
313                 return NULL;
314         }
315
316         return inhdr;
317 }
318
319 /* Sanity-checking wrapper for reply. */
320 struct ctdb_reply_control *unpack_reply_control(struct ctdb_request *req,
321                                                 enum ctdb_controls control)
322 {
323         size_t len;
324         struct ctdb_reply_control *inhdr = io_elem_data(req->reply, &len);
325
326         /* Library user error if this isn't a reply to a call. */
327         if (len < sizeof(*inhdr)) {
328                 errno = EINVAL;
329                 DEBUG(req->ctdb, LOG_ALERT,
330                       "Short ctdbd control reply: %zu bytes", len);
331                 return NULL;
332         }
333         if (req->hdr.hdr->operation != CTDB_REQ_CONTROL) {
334                 errno = EINVAL;
335                 DEBUG(req->ctdb, LOG_ALERT,
336                       "This was not a ctdbd control request: operation %u",
337                       req->hdr.hdr->operation);
338                 return NULL;
339         }
340
341         /* ... or if it was a different control from what we expected. */
342         if (req->hdr.control->opcode != control) {
343                 errno = EINVAL;
344                 DEBUG(req->ctdb, LOG_ALERT,
345                       "This was not an opcode %u ctdbd control request: %u",
346                       control, req->hdr.control->opcode);
347                 return NULL;
348         }
349
350         /* ctdbd or our error if this isn't a reply call. */
351         if (inhdr->hdr.operation != CTDB_REPLY_CONTROL) {
352                 errno = EIO;
353                 DEBUG(req->ctdb, LOG_CRIT,
354                       "Invalid ctdbd control reply: operation %u",
355                       inhdr->hdr.operation);
356                 return NULL;
357         }
358
359         return inhdr;
360 }
361
362 static void handle_incoming(struct ctdb_connection *ctdb, struct io_elem *in)
363 {
364         struct ctdb_req_header *hdr;
365         size_t len;
366         struct ctdb_request *i;
367
368         hdr = io_elem_data(in, &len);
369         /* FIXME: use len to check packet! */
370
371         if (hdr->operation == CTDB_REQ_MESSAGE) {
372                 deliver_message(ctdb, hdr);
373                 return;
374         }
375
376         for (i = ctdb->doneq; i; i = i->next) {
377                 if (i->hdr.hdr->reqid == hdr->reqid) {
378                         DLIST_REMOVE(ctdb->doneq, i);
379                         i->reply = in;
380                         i->callback(ctdb, i, i->priv_data);
381                         return;
382                 }
383         }
384         DEBUG(ctdb, LOG_WARNING,
385               "Unexpected ctdbd request reply: operation %u reqid %u",
386               hdr->operation, hdr->reqid);
387         free_io_elem(in);
388 }
389
/*
 * Remove "harmless" errors: a negative @ret whose errno is EINTR or
 * EWOULDBLOCK is reported as 0; every other value is passed through.
 */
static ssize_t real_error(ssize_t ret)
{
	if (ret >= 0)
		return ret;

	switch (errno) {
	case EINTR:
	case EWOULDBLOCK:
		return 0;
	default:
		return ret;
	}
}
397
398 bool ctdb_service(struct ctdb_connection *ctdb, int revents)
399 {
400         if (ctdb->broken) {
401                 return false;
402         }
403
404         if (holding_lock(ctdb)) {
405                 DEBUG(ctdb, LOG_ALERT, "Do not block while holding lock!");
406         }
407
408         if (revents & POLLOUT) {
409                 while (ctdb->outq) {
410                         if (real_error(write_io_elem(ctdb->fd,
411                                                      ctdb->outq->io)) < 0) {
412                                 DEBUG(ctdb, LOG_ERR,
413                                       "ctdb_service: error writing to ctdbd");
414                                 ctdb->broken = true;
415                                 return false;
416                         }
417                         if (io_elem_finished(ctdb->outq->io)) {
418                                 struct ctdb_request *done = ctdb->outq;
419                                 DLIST_REMOVE(ctdb->outq, done);
420                                 /* We add at the head: any dead ones
421                                  * sit and end. */
422                                 DLIST_ADD(ctdb->doneq, done);
423                         }
424                 }
425         }
426
427         while (revents & POLLIN) {
428                 int ret;
429                 int num_ready = 0;
430
431                 if (ioctl(ctdb->fd, FIONREAD, &num_ready) != 0) {
432                         DEBUG(ctdb, LOG_ERR,
433                               "ctdb_service: ioctl(FIONREAD) %d", errno);
434                         ctdb->broken = true;
435                         return false;
436                 }
437                 if (num_ready == 0) {
438                         /* the descriptor has been closed or we have all our data */
439                         break;
440                 }
441
442
443                 if (!ctdb->in) {
444                         ctdb->in = new_io_elem(sizeof(struct ctdb_req_header));
445                         if (!ctdb->in) {
446                                 DEBUG(ctdb, LOG_ERR,
447                                       "ctdb_service: allocating readbuf");
448                                 ctdb->broken = true;
449                                 return false;
450                         }
451                 }
452
453                 ret = read_io_elem(ctdb->fd, ctdb->in);
454                 if (real_error(ret) < 0 || ret == 0) {
455                         /* They closed fd? */
456                         if (ret == 0)
457                                 errno = EBADF;
458                         DEBUG(ctdb, LOG_ERR,
459                               "ctdb_service: error reading from ctdbd");
460                         ctdb->broken = true;
461                         return false;
462                 } else if (ret < 0) {
463                         /* No progress, stop loop. */
464                         break;
465                 } else if (io_elem_finished(ctdb->in)) {
466                         io_elem_queue(ctdb, ctdb->in);
467                         ctdb->in = NULL;
468                 }
469         }
470
471
472         while (ctdb->inqueue != NULL) {
473                 struct io_elem *io = ctdb->inqueue;
474
475                 io_elem_dequeue(ctdb, io);
476                 handle_incoming(ctdb, io);
477         }
478
479         return true;
480 }
481
482 /* This is inefficient.  We could pull in idtree.c. */
483 static bool reqid_used(const struct ctdb_connection *ctdb, uint32_t reqid)
484 {
485         struct ctdb_request *i;
486
487         for (i = ctdb->outq; i; i = i->next) {
488                 if (i->hdr.hdr->reqid == reqid) {
489                         return true;
490                 }
491         }
492         for (i = ctdb->doneq; i; i = i->next) {
493                 if (i->hdr.hdr->reqid == reqid) {
494                         return true;
495                 }
496         }
497         return false;
498 }
499
500 uint32_t new_reqid(struct ctdb_connection *ctdb)
501 {
502         while (reqid_used(ctdb, ctdb->next_id)) {
503                 ctdb->next_id++;
504         }
505         return ctdb->next_id++;
506 }
507
508 struct ctdb_request *new_ctdb_control_request(struct ctdb_connection *ctdb,
509                                               uint32_t opcode,
510                                               uint32_t destnode,
511                                               const void *extra_data,
512                                               size_t extra,
513                                               ctdb_callback_t callback,
514                                               void *cbdata)
515 {
516         struct ctdb_request *req;
517         struct ctdb_req_control *pkt;
518
519         req = new_ctdb_request(
520                 ctdb, offsetof(struct ctdb_req_control, data) + extra,
521                 callback, cbdata);
522         if (!req)
523                 return NULL;
524
525         io_elem_init_req_header(req->io,
526                                 CTDB_REQ_CONTROL, destnode, new_reqid(ctdb));
527
528         pkt = req->hdr.control;
529         pkt->pad = 0;
530         pkt->opcode = opcode;
531         pkt->srvid = 0;
532         pkt->client_id = 0;
533         pkt->flags = 0;
534         pkt->datalen = extra;
535         memcpy(pkt->data, extra_data, extra);
536         DLIST_ADD(ctdb->outq, req);
537         return req;
538 }
539
/*
 * Callback installed by ctdb_cancel(): when the cancelled request
 * finally completes, all that is left to do is free it.
 */
void ctdb_cancel_callback(struct ctdb_connection *ctdb,
			  struct ctdb_request *req,
			  void *unused)
{
	ctdb_request_free(req);
}
546
547 void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
548 {
549         if (!req->next && !req->prev) {
550                 DEBUG(ctdb, LOG_ALERT,
551                       "ctdb_cancel: request completed! ctdb_request_free? %p (id %u)",
552                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
553                 ctdb_request_free(req);
554                 return;
555         }
556
557         DEBUG(ctdb, LOG_DEBUG, "ctdb_cancel: %p (id %u)",
558               req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
559
560         /* FIXME: If it's not sent, we could just free it right now. */
561         req->callback = ctdb_cancel_callback;
562 }
563
564 void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db)
565 {
566         cleanup_locks(ctdb, db);
567         tdb_close(db->tdb);
568         free(db);
569 }
570
/*
 * Forward declarations for the attach sequence; ctdb_attachdb_send()
 * refers to these before their definitions.
 */

/* Extra destructor for the attach request. */
static void destroy_req_db(struct ctdb_connection *ctdb,
			   struct ctdb_request *req);
/* Callback for the DB_ATTACH reply. */
static void attachdb_done(struct ctdb_connection *ctdb,
			  struct ctdb_request *req,
			  void *_db);
/* Callback for the GETDBPATH reply. */
static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
				    struct ctdb_request *req,
				    void *_db);

579
580 struct ctdb_request *
581 ctdb_attachdb_send(struct ctdb_connection *ctdb,
582                    const char *name, bool persistent, uint32_t tdb_flags,
583                    ctdb_callback_t callback, void *private_data)
584 {
585         struct ctdb_request *req;
586         struct ctdb_db *db;
587         uint32_t opcode;
588
589         /* FIXME: Search if db already open. */
590         db = malloc(sizeof(*db));
591         if (!db) {
592                 return NULL;
593         }
594
595         if (persistent) {
596                 opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
597         } else {
598                 opcode = CTDB_CONTROL_DB_ATTACH;
599         }
600
601         req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
602                                        strlen(name) + 1, attachdb_done, db);
603         if (!req) {
604                 DEBUG(ctdb, LOG_ERR,
605                       "ctdb_attachdb_send: failed allocating DB_ATTACH");
606                 free(db);
607                 return NULL;
608         }
609
610         db->ctdb = ctdb;
611         db->tdb_flags = tdb_flags;
612         db->persistent = persistent;
613         db->callback = callback;
614         db->private_data = private_data;
615
616         req->extra_destructor = destroy_req_db;
617         /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
618         req->extra = NULL;
619
620         /* Flags get overloaded into srvid. */
621         req->hdr.control->srvid = tdb_flags;
622         DEBUG(db->ctdb, LOG_DEBUG,
623               "ctdb_attachdb_send: DB_ATTACH request %p", req);
624         return req;
625 }
626
627 static void destroy_req_db(struct ctdb_connection *ctdb,
628                            struct ctdb_request *req)
629 {
630         /* Incomplete db is in priv_data. */
631         free(req->priv_data);
632         /* second request is chained off this one. */
633         if (req->extra) {
634                 ctdb_request_free(req->extra);
635         }
636 }
637
638 static void attachdb_done(struct ctdb_connection *ctdb,
639                           struct ctdb_request *req,
640                           void *_db)
641 {
642         struct ctdb_db *db = _db;
643         struct ctdb_request *req2;
644         struct ctdb_reply_control *reply;
645         enum ctdb_controls control = CTDB_CONTROL_DB_ATTACH;
646
647         if (db->persistent) {
648                 control = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
649         }
650
651         reply = unpack_reply_control(req, control);
652         if (!reply || reply->status != 0) {
653                 if (reply) {
654                         DEBUG(ctdb, LOG_ERR,
655                               "ctdb_attachdb_send(async): DB_ATTACH status %i",
656                               reply->status);
657                 }
658                 /* We failed.  Hand request to user and have them discover it
659                  * via ctdb_attachdb_recv. */
660                 db->callback(ctdb, req, db->private_data);
661                 return;
662         }
663         db->id = *(uint32_t *)reply->data;
664
665         /* Now we do another call, to get the dbpath. */
666         req2 = new_ctdb_control_request(db->ctdb, CTDB_CONTROL_GETDBPATH,
667                                         CTDB_CURRENT_NODE,
668                                         &db->id, sizeof(db->id),
669                                         attachdb_getdbpath_done, db);
670         if (!req2) {
671                 DEBUG(db->ctdb, LOG_ERR,
672                       "ctdb_attachdb_send(async): failed to allocate");
673                 db->callback(ctdb, req, db->private_data);
674                 return;
675         }
676         req->extra = req2;
677         req2->extra = req;
678         DEBUG(db->ctdb, LOG_DEBUG,
679               "ctdb_attachdb_send(async): created getdbpath request");
680 }
681
682 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
683                                     struct ctdb_request *req,
684                                     void *_db)
685 {
686         struct ctdb_db *db = _db;
687
688         /* Do callback on original request. */
689         db->callback(ctdb, req->extra, db->private_data);
690 }
691
692 struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
693                                    struct ctdb_request *req)
694 {
695         struct ctdb_request *dbpath_req = req->extra;
696         struct ctdb_reply_control *reply;
697         struct ctdb_db *db = req->priv_data;
698         uint32_t tdb_flags = db->tdb_flags;
699         struct tdb_logging_context log;
700
701         /* Never sent the dbpath request?  We've failed. */
702         if (!dbpath_req) {
703                 /* FIXME: Save errno? */
704                 errno = EINVAL;
705                 return NULL;
706         }
707
708         reply = unpack_reply_control(dbpath_req, CTDB_CONTROL_GETDBPATH);
709         if (!reply) {
710                 return NULL;
711         }
712         if (reply->status != 0) {
713                 DEBUG(db->ctdb, LOG_ERR,
714                       "ctdb_attachdb_recv: reply status %i", reply->status);
715                 return NULL;
716         }
717
718         tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC;
719         tdb_flags |= TDB_DISALLOW_NESTING;
720
721         log.log_fn = ctdb_tdb_log_bridge;
722         log.log_private = ctdb;
723         db->tdb = tdb_open_ex((char *)reply->data, 0, tdb_flags, O_RDWR, 0,
724                               &log, NULL);
725         if (db->tdb == NULL) {
726                 DEBUG(db->ctdb, LOG_ERR,
727                       "ctdb_attachdb_recv: failed to tdb_open %s",
728                       (char *)reply->data);
729                 return NULL;
730         }
731
732         /* Finally, separate the db from the request (see destroy_req_db). */
733         req->priv_data = NULL;
734         DEBUG(db->ctdb, LOG_DEBUG,
735               "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
736         return db;
737 }
738
739 static unsigned long lock_magic(struct ctdb_lock *lock)
740 {
741         /* A non-zero magic specific to this structure. */
742         return ((unsigned long)lock->key.dptr
743                 ^ (((unsigned long)lock->key.dptr) << 16)
744                 ^ 0xBADC0FFEEBADC0DEULL)
745                 | 1;
746 }
747
748 /* This is only called on locks before they're held. */
749 static void free_lock(struct ctdb_lock *lock)
750 {
751         if (lock->held_magic) {
752                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
753                       "free_lock invalid lock %p", lock);
754         }
755         free(lock->hdr);
756         free(lock);
757 }
758
759
760 void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock)
761 {
762         if (lock->held_magic != lock_magic(lock)) {
763                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
764                       "ctdb_release_lock invalid lock %p", lock);
765         } else if (lock->ctdb_db != ctdb_db) {
766                 errno = EBADF;
767                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
768                       "ctdb_release_lock: wrong ctdb_db.");
769         } else {
770                 tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
771                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
772                       "ctdb_release_lock %p", lock);
773                 remove_lock(lock->ctdb_db->ctdb, lock);
774         }
775         lock->held_magic = 0;
776         free_lock(lock);
777 }
778
779
780 /* We keep the lock if local node is the dmaster. */
781 static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
782 {
783         struct ctdb_ltdb_header *hdr;
784
785         if (tdb_chainlock(lock->ctdb_db->tdb, lock->key) != 0) {
786                 DEBUG(lock->ctdb_db->ctdb, LOG_WARNING,
787                       "ctdb_readrecordlock_async: failed to chainlock");
788                 return NULL;
789         }
790
791         hdr = ctdb_local_fetch(lock->ctdb_db->tdb, lock->key, data);
792         if (hdr && lock->readonly && (hdr->flags & CTDB_REC_RO_HAVE_READONLY) ) {
793                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
794                       "ctdb_readrecordlock_async: got local lock for ro");
795                 lock->held_magic = lock_magic(lock);
796                 lock->hdr = hdr;
797                 add_lock(lock->ctdb_db->ctdb, lock);
798                 return true;
799         }
800         if (hdr && hdr->dmaster == lock->ctdb_db->ctdb->pnn) {
801                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
802                       "ctdb_readrecordlock_async: got local lock");
803                 lock->held_magic = lock_magic(lock);
804                 lock->hdr = hdr;
805                 add_lock(lock->ctdb_db->ctdb, lock);
806                 return true;
807         }
808
809         /* we dont have the record locally,
810          * drop to writelock to force a migration
811          */
812         if (!hdr && lock->readonly) {
813                 lock->readonly = false;
814         }
815
816         tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
817         free(hdr);
818         return NULL;
819 }
820
821 /* If they shutdown before we hand them the lock, we free it here. */
822 static void destroy_lock(struct ctdb_connection *ctdb,
823                          struct ctdb_request *req)
824 {
825         free_lock(req->extra);
826 }
827
828 static void readrecordlock_retry(struct ctdb_connection *ctdb,
829                                  struct ctdb_request *req, void *private)
830 {
831         struct ctdb_lock *lock = req->extra;
832         struct ctdb_reply_call *reply;
833         TDB_DATA data;
834
835         /* OK, we've received reply to fetch-with-header migration */
836         reply = unpack_reply_call(req, CTDB_FETCH_WITH_HEADER_FUNC);
837         if (!reply || reply->status != 0) {
838                 if (reply) {
839                         DEBUG(ctdb, LOG_ERR,
840                               "ctdb_readrecordlock_async(async):"
841                               " FETCH_WITH_HEADER_FUNC returned %i", reply->status);
842                 }
843                 lock->callback(lock->ctdb_db, NULL, tdb_null, private);
844                 ctdb_request_free(req); /* Also frees lock. */
845                 return;
846         }
847
848         /* Can we get lock now? */
849         if (try_readrecordlock(lock, &data)) {
850                 /* Now it's their responsibility to free lock & request! */
851                 req->extra_destructor = NULL;
852                 lock->callback(lock->ctdb_db, lock, data, private);
853                 ctdb_request_free(req);
854                 return;
855         }
856
857         /* Retransmit the same request again (we lost race). */
858         io_elem_reset(req->io);
859         DLIST_ADD(ctdb->outq, req);
860 }
861
862 static bool
863 ctdb_readrecordlock_internal(struct ctdb_db *ctdb_db, TDB_DATA key,
864                              bool readonly,
865                              ctdb_rrl_callback_t callback, void *cbdata)
866 {
867         struct ctdb_request *req;
868         struct ctdb_lock *lock;
869         TDB_DATA data;
870
871         if (holding_lock(ctdb_db->ctdb)) {
872                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
873                       "ctdb_readrecordlock_async: already holding lock");
874                 return false;
875         }
876
877         /* Setup lock */
878         lock = malloc(sizeof(*lock) + key.dsize);
879         if (!lock) {
880                 DEBUG(ctdb_db->ctdb, LOG_ERR,
881                       "ctdb_readrecordlock_async: lock allocation failed");
882                 return false;
883         }
884         lock->key.dptr = (void *)(lock + 1);
885         memcpy(lock->key.dptr, key.dptr, key.dsize);
886         lock->key.dsize = key.dsize;
887         lock->ctdb_db = ctdb_db;
888         lock->hdr = NULL;
889         lock->held_magic = 0;
890         lock->readonly = readonly;
891
892         /* Fast path. */
893         if (try_readrecordlock(lock, &data)) {
894                 callback(ctdb_db, lock, data, cbdata);
895                 return true;
896         }
897
898         /* Slow path: create request. */
899         req = new_ctdb_request(
900                 ctdb_db->ctdb,
901                 offsetof(struct ctdb_req_call, data) + key.dsize,
902                 readrecordlock_retry, cbdata);
903         if (!req) {
904                 DEBUG(ctdb_db->ctdb, LOG_ERR,
905                       "ctdb_readrecordlock_async: allocation failed");
906                 free_lock(lock);
907                 return NULL;
908         }
909         req->extra = lock;
910         req->extra_destructor = destroy_lock;
911         /* We store the original callback in the lock, and use our own. */
912         lock->callback = callback;
913
914         io_elem_init_req_header(req->io, CTDB_REQ_CALL, CTDB_CURRENT_NODE,
915                                 new_reqid(ctdb_db->ctdb));
916
917         if (lock->readonly) {
918                 req->hdr.call->flags = CTDB_WANT_READONLY;
919         } else {
920                 req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
921         }
922         req->hdr.call->db_id = ctdb_db->id;
923         req->hdr.call->callid = CTDB_FETCH_WITH_HEADER_FUNC;
924         req->hdr.call->hopcount = 0;
925         req->hdr.call->keylen = key.dsize;
926         req->hdr.call->calldatalen = 0;
927         memcpy(req->hdr.call->data, key.dptr, key.dsize);
928         DLIST_ADD(ctdb_db->ctdb->outq, req);
929         return true;
930 }
931
932 bool
933 ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
934                           ctdb_rrl_callback_t callback, void *cbdata)
935 {
936         return ctdb_readrecordlock_internal(ctdb_db, key,
937                         false,
938                         callback, cbdata);
939 }
940
941 bool
942 ctdb_readonlyrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
943                           ctdb_rrl_callback_t callback, void *cbdata)
944 {
945         return ctdb_readrecordlock_internal(ctdb_db, key,
946                         true,
947                         callback, cbdata);
948 }
949
950 bool ctdb_writerecord(struct ctdb_db *ctdb_db,
951                       struct ctdb_lock *lock, TDB_DATA data)
952 {
953         if (lock->readonly) {
954                 errno = EBADF;
955                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
956                       "ctdb_writerecord: Can not write, read-only record.");
957                 return false;
958         }
959
960         if (lock->ctdb_db != ctdb_db) {
961                 errno = EBADF;
962                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
963                       "ctdb_writerecord: Can not write, wrong ctdb_db.");
964                 return false;
965         }
966
967         if (lock->held_magic != lock_magic(lock)) {
968                 errno = EBADF;
969                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
970                       "ctdb_writerecord: Can not write. Lock has been released.");
971                 return false;
972         }
973                 
974         if (ctdb_db->persistent) {
975                 errno = EINVAL;
976                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
977                       "ctdb_writerecord: cannot write to persistent db");
978                 return false;
979         }
980
981         switch (ctdb_local_store(ctdb_db->tdb, lock->key, lock->hdr, data)) {
982         case 0:
983                 DEBUG(ctdb_db->ctdb, LOG_DEBUG,
984                       "ctdb_writerecord: optimized away noop write.");
985                 /* fall thru */
986         case 1:
987                 return true;
988
989         default:
990                 switch (errno) {
991                 case ENOMEM:
992                         DEBUG(ctdb_db->ctdb, LOG_CRIT,
993                               "ctdb_writerecord: out of memory.");
994                         break;
995                 case EINVAL:
996                         DEBUG(ctdb_db->ctdb, LOG_ALERT,
997                               "ctdb_writerecord: record changed under lock?");
998                         break;
999                 default: /* TDB already logged. */
1000                         break;
1001                 }
1002                 return false;
1003         }
1004 }
1005
1006
1007 struct ctdb_traverse_state {
1008         struct ctdb_request *handle;
1009         struct ctdb_db *ctdb_db;
1010         uint64_t srvid;
1011
1012         ctdb_traverse_callback_t callback;
1013         void *cbdata;
1014 };
1015
1016 static void traverse_remhnd_cb(struct ctdb_connection *ctdb,
1017                         struct ctdb_request *req, void *private_data)
1018 {
1019         struct ctdb_traverse_state *state = private_data;
1020
1021         if (!ctdb_remove_message_handler_recv(ctdb, state->handle)) {
1022                 DEBUG(ctdb, LOG_ERR,
1023                                 "Failed to remove message handler for"
1024                                 " traverse.");
1025                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1026                                 TRAVERSE_STATUS_ERROR,
1027                                 tdb_null, tdb_null,
1028                                 state->cbdata);
1029         }
1030         ctdb_request_free(state->handle);
1031         state->handle = NULL;
1032         free(state);
1033 }
1034         
1035 static void msg_h(struct ctdb_connection *ctdb, uint64_t srvid,
1036            TDB_DATA data, void *private_data)
1037 {
1038         struct ctdb_traverse_state *state = private_data;
1039         struct ctdb_db *ctdb_db = state->ctdb_db;
1040         struct ctdb_rec_data *d = (struct ctdb_rec_data *)data.dptr;
1041         TDB_DATA key;
1042
1043         if (data.dsize < sizeof(uint32_t) ||
1044             d->length != data.dsize) {
1045                 DEBUG(ctdb, LOG_ERR,
1046                         "Bad data size %u in traverse_handler",
1047                         (unsigned)data.dsize);
1048                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1049                                 TRAVERSE_STATUS_ERROR,
1050                                 tdb_null, tdb_null,
1051                                 state->cbdata);
1052                 state->handle = ctdb_remove_message_handler_send(
1053                                 state->ctdb_db->ctdb, state->srvid,
1054                                 msg_h, state,
1055                                 traverse_remhnd_cb, state);
1056                 return;
1057         }
1058
1059         key.dsize = d->keylen;
1060         key.dptr  = &d->data[0];
1061         data.dsize = d->datalen;
1062         data.dptr = &d->data[d->keylen];
1063
1064         if (key.dsize == 0 && data.dsize == 0) {
1065                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1066                                 TRAVERSE_STATUS_FINISHED,
1067                                 tdb_null, tdb_null,
1068                                 state->cbdata);
1069                 state->handle = ctdb_remove_message_handler_send(
1070                                 state->ctdb_db->ctdb, state->srvid,
1071                                 msg_h, state,
1072                                 traverse_remhnd_cb, state);
1073                 return;
1074         }
1075
1076         if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1077                 /* empty records are deleted records in ctdb */
1078                 return;
1079         }
1080
1081         data.dsize -= sizeof(struct ctdb_ltdb_header);
1082         data.dptr  += sizeof(struct ctdb_ltdb_header);
1083
1084         if (state->callback(ctdb, ctdb_db,
1085                         TRAVERSE_STATUS_RECORD,
1086                         key, data, state->cbdata) != 0) {
1087                 state->handle = ctdb_remove_message_handler_send(
1088                                 state->ctdb_db->ctdb, state->srvid,
1089                                 msg_h, state,
1090                                 traverse_remhnd_cb, state);
1091                 return;
1092         }
1093 }
1094
1095 static void traverse_start_cb(struct ctdb_connection *ctdb,
1096                         struct ctdb_request *req, void *private_data)
1097 {
1098         struct ctdb_traverse_state *state = private_data;
1099
1100         ctdb_request_free(state->handle);
1101         state->handle = NULL;
1102 }
1103
1104 static void traverse_msghnd_cb(struct ctdb_connection *ctdb,
1105                         struct ctdb_request *req, void *private_data)
1106 {
1107         struct ctdb_traverse_state *state = private_data;
1108         struct ctdb_db *ctdb_db = state->ctdb_db;
1109         struct ctdb_traverse_start t;
1110
1111         if (!ctdb_set_message_handler_recv(ctdb, state->handle)) {
1112                 DEBUG(ctdb, LOG_ERR,
1113                                 "Failed to register message handler for"
1114                                 " traverse.");
1115                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1116                                 TRAVERSE_STATUS_ERROR,
1117                                 tdb_null, tdb_null,
1118                                 state->cbdata);
1119                 ctdb_request_free(state->handle);
1120                 state->handle = NULL;
1121                 free(state);
1122                 return;
1123         }
1124         ctdb_request_free(state->handle);
1125         state->handle = NULL;
1126
1127         t.db_id = ctdb_db->id;
1128         t.srvid = state->srvid;
1129         t.reqid = 0;
1130
1131         state->handle = new_ctdb_control_request(ctdb,
1132                                 CTDB_CONTROL_TRAVERSE_START,
1133                                 CTDB_CURRENT_NODE,
1134                                 &t, sizeof(t),
1135                                 traverse_start_cb, state);
1136         if (state->handle == NULL) {
1137                 DEBUG(ctdb, LOG_ERR,
1138                                 "ctdb_traverse_async:"
1139                                 " failed to send traverse_start control");
1140                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1141                                 TRAVERSE_STATUS_ERROR,
1142                                 tdb_null, tdb_null,
1143                                 state->cbdata);
1144                 state->handle = ctdb_remove_message_handler_send(
1145                                 state->ctdb_db->ctdb, state->srvid,
1146                                 msg_h, state,
1147                                 traverse_remhnd_cb, state);
1148                 return;
1149         }
1150 }
1151
1152 bool ctdb_traverse_async(struct ctdb_db *ctdb_db,
1153                          ctdb_traverse_callback_t callback, void *cbdata)
1154 {
1155         struct ctdb_connection *ctdb = ctdb_db->ctdb;
1156         struct ctdb_traverse_state *state;
1157         static uint32_t tid = 0;
1158
1159         state = malloc(sizeof(struct ctdb_traverse_state));
1160         if (state == NULL) {
1161                 DEBUG(ctdb, LOG_ERR,
1162                                 "ctdb_traverse_async: no memory."
1163                                 " allocate state failed");
1164                 return false;
1165         }
1166
1167         tid++;
1168         state->srvid = CTDB_SRVID_TRAVERSE_RANGE|tid;
1169
1170         state->callback = callback;
1171         state->cbdata   = cbdata;
1172         state->ctdb_db  = ctdb_db;
1173
1174         state->handle = ctdb_set_message_handler_send(ctdb_db->ctdb,
1175                                 state->srvid,
1176                                 msg_h, state,
1177                                 traverse_msghnd_cb, state);
1178         if (state->handle == NULL) {
1179                 DEBUG(ctdb, LOG_ERR,
1180                         "ctdb_traverse_async:"
1181                         " failed ctdb_set_message_handler_send");
1182                 free(state);
1183                 return false;
1184         }
1185
1186         return true;
1187 }
1188
1189 int ctdb_num_out_queue(struct ctdb_connection *ctdb)
1190 {
1191         struct ctdb_request *req;
1192         int i;
1193
1194         for (i = 0, req = ctdb->outq; req; req = req->next, i++)
1195                 ;
1196
1197         return i;
1198 }
1199
1200 int ctdb_num_in_flight(struct ctdb_connection *ctdb)
1201 {
1202         struct ctdb_request *req;
1203         int i;
1204
1205         for (i = 0, req = ctdb->doneq; req; req = req->next, i++)
1206                 ;
1207
1208         return i;
1209 }
1210
/* Return the total of ctdb_num_out_queue() and ctdb_num_in_flight(). */
int ctdb_num_active(struct ctdb_connection *ctdb)
{
	int queued = ctdb_num_out_queue(ctdb);
	int in_flight = ctdb_num_in_flight(ctdb);

	return queued + in_flight;
}
1216