TMP: add a ctdb snapshot of current ctdb master (git://git.samba.org/ctdb.git) to...
[obnox/samba/samba-obnox.git] / ctdb / libctdb / ctdb.c
1 /*
2    core of libctdb
3
4    Copyright (C) Rusty Russell 2010
5
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, see <http://www.gnu.org/licenses/>.
18 */
19 #include <ctdb.h>
20 #include <poll.h>
21 #include <errno.h>
22 #include <unistd.h>
23 #include <fcntl.h>
24 #include <stdlib.h>
25 #include <sys/socket.h>
26 #include <sys/un.h>
27 #include <sys/ioctl.h>
28 #include "libctdb_private.h"
29 #include "io_elem.h"
30 #include "local_tdb.h"
31 #include "messages.h"
32 #include <dlinklist.h>
33 #include <ctdb_protocol.h>
34
35 /* Remove type-safety macros. */
36 #undef ctdb_attachdb_send
37 #undef ctdb_readrecordlock_async
38 #undef ctdb_connect
39
40 struct ctdb_lock {
41         struct ctdb_lock *next, *prev;
42
43         struct ctdb_db *ctdb_db;
44         TDB_DATA key;
45
46         /* This will always be set by the time user sees this. */
47         unsigned long held_magic;
48         struct ctdb_ltdb_header *hdr;
49
50         /* For convenience, we stash original callback here. */
51         ctdb_rrl_callback_t callback;
52 };
53
54 struct ctdb_db {
55         struct ctdb_connection *ctdb;
56         bool persistent;
57         uint32_t tdb_flags;
58         uint32_t id;
59         struct tdb_context *tdb;
60
61         ctdb_callback_t callback;
62         void *private_data;
63 };
64
65 static void remove_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
66 {
67         DLIST_REMOVE(ctdb->locks, lock);
68 }
69
70 /* FIXME: for thread safety, need tid info too. */
71 static bool holding_lock(struct ctdb_connection *ctdb)
72 {
73         /* For the moment, you can't ever hold more than 1 lock. */
74         return (ctdb->locks != NULL);
75 }
76
77 static void add_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
78 {
79         DLIST_ADD(ctdb->locks, lock);
80 }
81
82 static void cleanup_locks(struct ctdb_connection *ctdb, struct ctdb_db *db)
83 {
84         struct ctdb_lock *i, *next;
85
86         for (i = ctdb->locks; i; i = next) {
87                 /* Grab next pointer, as release_lock will free i */
88                 next = i->next;
89                 if (i->ctdb_db == db) {
90                         ctdb_release_lock(db, i);
91                 }
92         }
93 }
94
/*
 * close() a descriptor without disturbing the caller's errno.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void close_noerr(int fd)
{
	const int saved_errno = errno;

	close(fd);
	errno = saved_errno;
}
102
/*
 * free() a pointer without disturbing the caller's errno.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void free_noerr(void *p)
{
	const int saved_errno = errno;

	free(p);
	errno = saved_errno;
}
110
/*
 * Switch @fd to non-blocking mode (best effort, no error is reported).
 *
 * If the current flags cannot be read, the descriptor is left untouched
 * rather than OR-ing the -1 error value into the new flag word.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void set_nonblocking(int fd)
{
	int flags = fcntl(fd, F_GETFL, 0);

	if (flags == -1) {
		return;
	}
	(void)fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
118
/*
 * Mark @fd close-on-exec (best effort, no error is reported).
 *
 * If the current descriptor flags cannot be read, the descriptor is left
 * untouched rather than OR-ing the -1 error value into the new flag word.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void set_close_on_exec(int fd)
{
	int flags = fcntl(fd, F_GETFD, 0);

	if (flags == -1) {
		return;
	}
	(void)fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
}
126
127 static void set_pnn(struct ctdb_connection *ctdb,
128                     struct ctdb_request *req,
129                     void *unused)
130 {
131         if (!ctdb_getpnn_recv(ctdb, req, &ctdb->pnn)) {
132                 DEBUG(ctdb, LOG_CRIT,
133                       "ctdb_connect(async): failed to get pnn");
134                 ctdb->broken = true;
135         }
136         ctdb_request_free(req);
137 }
138
139 struct ctdb_connection *ctdb_connect(const char *addr,
140                                      ctdb_log_fn_t log_fn, void *log_priv)
141 {
142         struct ctdb_connection *ctdb;
143         struct sockaddr_un sun;
144
145         ctdb = malloc(sizeof(*ctdb));
146         if (!ctdb) {
147                 /* With no format string, we hope it doesn't use ap! */
148                 va_list ap;
149                 memset(&ap, 0, sizeof(ap));
150                 errno = ENOMEM;
151                 log_fn(log_priv, LOG_ERR, "ctdb_connect: no memory", ap);
152                 goto fail;
153         }
154         ctdb->pnn = -1;
155         ctdb->outq = NULL;
156         ctdb->doneq = NULL;
157         ctdb->in = NULL;
158         ctdb->inqueue = NULL;
159         ctdb->message_handlers = NULL;
160         ctdb->next_id = 0;
161         ctdb->broken = false;
162         ctdb->log = log_fn;
163         ctdb->log_priv = log_priv;
164         ctdb->locks = NULL;
165
166         memset(&sun, 0, sizeof(sun));
167         sun.sun_family = AF_UNIX;
168         if (!addr)
169                 addr = CTDB_PATH;
170         strncpy(sun.sun_path, addr, sizeof(sun.sun_path)-1);
171         ctdb->fd = socket(AF_UNIX, SOCK_STREAM, 0);
172         if (ctdb->fd < 0)
173                 goto free_fail;
174
175         set_nonblocking(ctdb->fd);
176         set_close_on_exec(ctdb->fd);
177
178         if (connect(ctdb->fd, (struct sockaddr *)&sun, sizeof(sun)) == -1)
179                 goto close_fail;
180
181         /* Immediately queue a request to get our pnn. */
182         if (!ctdb_getpnn_send(ctdb, CTDB_CURRENT_NODE, set_pnn, NULL))
183                 goto close_fail;
184
185         return ctdb;
186
187 close_fail:
188         close_noerr(ctdb->fd);
189 free_fail:
190         free_noerr(ctdb);
191 fail:
192         return NULL;
193 }
194
195 void ctdb_disconnect(struct ctdb_connection *ctdb)
196 {
197         struct ctdb_request *i;
198
199         DEBUG(ctdb, LOG_DEBUG, "ctdb_disconnect");
200
201         while ((i = ctdb->outq) != NULL) {
202                 DLIST_REMOVE(ctdb->outq, i);
203                 ctdb_request_free(i);
204         }
205
206         while ((i = ctdb->doneq) != NULL) {
207                 DLIST_REMOVE(ctdb->doneq, i);
208                 ctdb_request_free(i);
209         }
210
211         if (ctdb->in)
212                 free_io_elem(ctdb->in);
213
214         remove_message_handlers(ctdb);
215
216         close(ctdb->fd);
217         /* Just in case they try to reuse */
218         ctdb->fd = -1;
219         free(ctdb);
220 }
221
222 int ctdb_get_fd(struct ctdb_connection *ctdb)
223 {
224         return ctdb->fd;
225 }
226
227 int ctdb_which_events(struct ctdb_connection *ctdb)
228 {
229         int events = POLLIN;
230
231         if (ctdb->outq)
232                 events |= POLLOUT;
233         return events;
234 }
235
236 struct ctdb_request *new_ctdb_request(struct ctdb_connection *ctdb, size_t len,
237                                       ctdb_callback_t cb, void *cbdata)
238 {
239         struct ctdb_request *req = malloc(sizeof(*req));
240         if (!req)
241                 return NULL;
242         req->io = new_io_elem(len);
243         if (!req->io) {
244                 free(req);
245                 return NULL;
246         }
247         req->ctdb = ctdb;
248         req->hdr.hdr = io_elem_data(req->io, NULL);
249         req->reply = NULL;
250         req->callback = cb;
251         req->priv_data = cbdata;
252         req->extra = NULL;
253         req->extra_destructor = NULL;
254         return req;
255 }
256
257 void ctdb_request_free(struct ctdb_request *req)
258 {
259         struct ctdb_connection *ctdb = req->ctdb;
260
261         if (req->next || req->prev) {
262                 DEBUG(ctdb, LOG_ALERT,
263                       "ctdb_request_free: request not complete! ctdb_cancel? %p (id %u)",
264                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
265                 ctdb_cancel(ctdb, req);
266                 return;
267         }
268         if (req->extra_destructor) {
269                 req->extra_destructor(ctdb, req);
270         }
271         if (req->reply) {
272                 free_io_elem(req->reply);
273         }
274         free_io_elem(req->io);
275         free(req);
276 }
277
278 /* Sanity-checking wrapper for reply. */
279 static struct ctdb_reply_call *unpack_reply_call(struct ctdb_request *req,
280                                                  uint32_t callid)
281 {
282         size_t len;
283         struct ctdb_reply_call *inhdr = io_elem_data(req->reply, &len);
284
285         /* Library user error if this isn't a reply to a call. */
286         if (req->hdr.hdr->operation != CTDB_REQ_CALL) {
287                 errno = EINVAL;
288                 DEBUG(req->ctdb, LOG_ALERT,
289                       "This was not a ctdbd call request: operation %u",
290                       req->hdr.hdr->operation);
291                 return NULL;
292         }
293
294         if (req->hdr.call->callid != callid) {
295                 errno = EINVAL;
296                 DEBUG(req->ctdb, LOG_ALERT,
297                       "This was not a ctdbd %u call request: %u",
298                       callid, req->hdr.call->callid);
299                 return NULL;
300         }
301
302         /* ctdbd or our error if this isn't a reply call. */
303         if (len < sizeof(*inhdr) || inhdr->hdr.operation != CTDB_REPLY_CALL) {
304                 errno = EIO;
305                 DEBUG(req->ctdb, LOG_CRIT,
306                       "Invalid ctdbd call reply: len %zu, operation %u",
307                       len, inhdr->hdr.operation);
308                 return NULL;
309         }
310
311         return inhdr;
312 }
313
314 /* Sanity-checking wrapper for reply. */
315 struct ctdb_reply_control *unpack_reply_control(struct ctdb_request *req,
316                                                 enum ctdb_controls control)
317 {
318         size_t len;
319         struct ctdb_reply_control *inhdr = io_elem_data(req->reply, &len);
320
321         /* Library user error if this isn't a reply to a call. */
322         if (len < sizeof(*inhdr)) {
323                 errno = EINVAL;
324                 DEBUG(req->ctdb, LOG_ALERT,
325                       "Short ctdbd control reply: %zu bytes", len);
326                 return NULL;
327         }
328         if (req->hdr.hdr->operation != CTDB_REQ_CONTROL) {
329                 errno = EINVAL;
330                 DEBUG(req->ctdb, LOG_ALERT,
331                       "This was not a ctdbd control request: operation %u",
332                       req->hdr.hdr->operation);
333                 return NULL;
334         }
335
336         /* ... or if it was a different control from what we expected. */
337         if (req->hdr.control->opcode != control) {
338                 errno = EINVAL;
339                 DEBUG(req->ctdb, LOG_ALERT,
340                       "This was not an opcode %u ctdbd control request: %u",
341                       control, req->hdr.control->opcode);
342                 return NULL;
343         }
344
345         /* ctdbd or our error if this isn't a reply call. */
346         if (inhdr->hdr.operation != CTDB_REPLY_CONTROL) {
347                 errno = EIO;
348                 DEBUG(req->ctdb, LOG_CRIT,
349                       "Invalid ctdbd control reply: operation %u",
350                       inhdr->hdr.operation);
351                 return NULL;
352         }
353
354         return inhdr;
355 }
356
357 static void handle_incoming(struct ctdb_connection *ctdb, struct io_elem *in)
358 {
359         struct ctdb_req_header *hdr;
360         size_t len;
361         struct ctdb_request *i;
362
363         hdr = io_elem_data(in, &len);
364         /* FIXME: use len to check packet! */
365
366         if (hdr->operation == CTDB_REQ_MESSAGE) {
367                 deliver_message(ctdb, hdr);
368                 return;
369         }
370
371         for (i = ctdb->doneq; i; i = i->next) {
372                 if (i->hdr.hdr->reqid == hdr->reqid) {
373                         DLIST_REMOVE(ctdb->doneq, i);
374                         i->reply = in;
375                         i->callback(ctdb, i, i->priv_data);
376                         return;
377                 }
378         }
379         DEBUG(ctdb, LOG_WARNING,
380               "Unexpected ctdbd request reply: operation %u reqid %u",
381               hdr->operation, hdr->reqid);
382         free_io_elem(in);
383 }
384
/*
 * Remove "harmless" errors.
 *
 * A negative @ret whose errno is EINTR, EWOULDBLOCK or EAGAIN is mapped
 * to 0 ("no progress"); every other value is returned unchanged.  EAGAIN
 * is listed separately because POSIX allows it to differ from
 * EWOULDBLOCK.
 */
static ssize_t real_error(ssize_t ret)
{
	if (ret >= 0)
		return ret;

	switch (errno) {
	case EINTR:
	case EWOULDBLOCK:
#if EAGAIN != EWOULDBLOCK
	case EAGAIN:
#endif
		return 0;
	default:
		return ret;
	}
}
392
393 bool ctdb_service(struct ctdb_connection *ctdb, int revents)
394 {
395         if (ctdb->broken) {
396                 return false;
397         }
398
399         if (holding_lock(ctdb)) {
400                 DEBUG(ctdb, LOG_ALERT, "Do not block while holding lock!");
401         }
402
403         if (revents & POLLOUT) {
404                 while (ctdb->outq) {
405                         if (real_error(write_io_elem(ctdb->fd,
406                                                      ctdb->outq->io)) < 0) {
407                                 DEBUG(ctdb, LOG_ERR,
408                                       "ctdb_service: error writing to ctdbd");
409                                 ctdb->broken = true;
410                                 return false;
411                         }
412                         if (io_elem_finished(ctdb->outq->io)) {
413                                 struct ctdb_request *done = ctdb->outq;
414                                 DLIST_REMOVE(ctdb->outq, done);
415                                 /* We add at the head: any dead ones
416                                  * sit and end. */
417                                 DLIST_ADD(ctdb->doneq, done);
418                         }
419                 }
420         }
421
422         while (revents & POLLIN) {
423                 int ret;
424                 int num_ready = 0;
425
426                 if (ioctl(ctdb->fd, FIONREAD, &num_ready) != 0) {
427                         DEBUG(ctdb, LOG_ERR,
428                               "ctdb_service: ioctl(FIONREAD) %d", errno);
429                         ctdb->broken = true;
430                         return false;
431                 }
432                 if (num_ready == 0) {
433                         /* the descriptor has been closed or we have all our data */
434                         break;
435                 }
436
437
438                 if (!ctdb->in) {
439                         ctdb->in = new_io_elem(sizeof(struct ctdb_req_header));
440                         if (!ctdb->in) {
441                                 DEBUG(ctdb, LOG_ERR,
442                                       "ctdb_service: allocating readbuf");
443                                 ctdb->broken = true;
444                                 return false;
445                         }
446                 }
447
448                 ret = read_io_elem(ctdb->fd, ctdb->in);
449                 if (real_error(ret) < 0 || ret == 0) {
450                         /* They closed fd? */
451                         if (ret == 0)
452                                 errno = EBADF;
453                         DEBUG(ctdb, LOG_ERR,
454                               "ctdb_service: error reading from ctdbd");
455                         ctdb->broken = true;
456                         return false;
457                 } else if (ret < 0) {
458                         /* No progress, stop loop. */
459                         break;
460                 } else if (io_elem_finished(ctdb->in)) {
461                         io_elem_queue(ctdb, ctdb->in);
462                         ctdb->in = NULL;
463                 }
464         }
465
466
467         while (ctdb->inqueue != NULL) {
468                 struct io_elem *io = ctdb->inqueue;
469
470                 io_elem_dequeue(ctdb, io);
471                 handle_incoming(ctdb, io);
472         }
473
474         return true;
475 }
476
477 /* This is inefficient.  We could pull in idtree.c. */
478 static bool reqid_used(const struct ctdb_connection *ctdb, uint32_t reqid)
479 {
480         struct ctdb_request *i;
481
482         for (i = ctdb->outq; i; i = i->next) {
483                 if (i->hdr.hdr->reqid == reqid) {
484                         return true;
485                 }
486         }
487         for (i = ctdb->doneq; i; i = i->next) {
488                 if (i->hdr.hdr->reqid == reqid) {
489                         return true;
490                 }
491         }
492         return false;
493 }
494
495 uint32_t new_reqid(struct ctdb_connection *ctdb)
496 {
497         while (reqid_used(ctdb, ctdb->next_id)) {
498                 ctdb->next_id++;
499         }
500         return ctdb->next_id++;
501 }
502
503 struct ctdb_request *new_ctdb_control_request(struct ctdb_connection *ctdb,
504                                               uint32_t opcode,
505                                               uint32_t destnode,
506                                               const void *extra_data,
507                                               size_t extra,
508                                               ctdb_callback_t callback,
509                                               void *cbdata)
510 {
511         struct ctdb_request *req;
512         struct ctdb_req_control *pkt;
513
514         req = new_ctdb_request(
515                 ctdb, offsetof(struct ctdb_req_control, data) + extra,
516                 callback, cbdata);
517         if (!req)
518                 return NULL;
519
520         io_elem_init_req_header(req->io,
521                                 CTDB_REQ_CONTROL, destnode, new_reqid(ctdb));
522
523         pkt = req->hdr.control;
524         pkt->pad = 0;
525         pkt->opcode = opcode;
526         pkt->srvid = 0;
527         pkt->client_id = 0;
528         pkt->flags = 0;
529         pkt->datalen = extra;
530         memcpy(pkt->data, extra_data, extra);
531         DLIST_ADD(ctdb->outq, req);
532         return req;
533 }
534
/*
 * Callback installed by ctdb_cancel(): simply frees the request once it
 * completes.  @ctdb and @unused are ignored.
 */
void ctdb_cancel_callback(struct ctdb_connection *ctdb,
			  struct ctdb_request *req,
			  void *unused)
{
	(void)ctdb;
	(void)unused;

	ctdb_request_free(req);
}
541
542 void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
543 {
544         if (!req->next && !req->prev) {
545                 DEBUG(ctdb, LOG_ALERT,
546                       "ctdb_cancel: request completed! ctdb_request_free? %p (id %u)",
547                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
548                 ctdb_request_free(req);
549                 return;
550         }
551
552         DEBUG(ctdb, LOG_DEBUG, "ctdb_cancel: %p (id %u)",
553               req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
554
555         /* FIXME: If it's not sent, we could just free it right now. */
556         req->callback = ctdb_cancel_callback;
557 }
558
559 void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db)
560 {
561         cleanup_locks(ctdb, db);
562         tdb_close(db->tdb);
563         free(db);
564 }
565
566 static void destroy_req_db(struct ctdb_connection *ctdb,
567                            struct ctdb_request *req);
568 static void attachdb_done(struct ctdb_connection *ctdb,
569                           struct ctdb_request *req,
570                           void *_db);
571 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
572                                     struct ctdb_request *req,
573                                     void *_db);
574
575 struct ctdb_request *
576 ctdb_attachdb_send(struct ctdb_connection *ctdb,
577                    const char *name, bool persistent, uint32_t tdb_flags,
578                    ctdb_callback_t callback, void *private_data)
579 {
580         struct ctdb_request *req;
581         struct ctdb_db *db;
582         uint32_t opcode;
583
584         /* FIXME: Search if db already open. */
585         db = malloc(sizeof(*db));
586         if (!db) {
587                 return NULL;
588         }
589
590         if (persistent) {
591                 opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
592         } else {
593                 opcode = CTDB_CONTROL_DB_ATTACH;
594         }
595
596         req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
597                                        strlen(name) + 1, attachdb_done, db);
598         if (!req) {
599                 DEBUG(ctdb, LOG_ERR,
600                       "ctdb_attachdb_send: failed allocating DB_ATTACH");
601                 free(db);
602                 return NULL;
603         }
604
605         db->ctdb = ctdb;
606         db->tdb_flags = tdb_flags;
607         db->persistent = persistent;
608         db->callback = callback;
609         db->private_data = private_data;
610
611         req->extra_destructor = destroy_req_db;
612         /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
613         req->extra = NULL;
614
615         /* Flags get overloaded into srvid. */
616         req->hdr.control->srvid = tdb_flags;
617         DEBUG(db->ctdb, LOG_DEBUG,
618               "ctdb_attachdb_send: DB_ATTACH request %p", req);
619         return req;
620 }
621
622 static void destroy_req_db(struct ctdb_connection *ctdb,
623                            struct ctdb_request *req)
624 {
625         /* Incomplete db is in priv_data. */
626         free(req->priv_data);
627         /* second request is chained off this one. */
628         if (req->extra) {
629                 ctdb_request_free(req->extra);
630         }
631 }
632
633 static void attachdb_done(struct ctdb_connection *ctdb,
634                           struct ctdb_request *req,
635                           void *_db)
636 {
637         struct ctdb_db *db = _db;
638         struct ctdb_request *req2;
639         struct ctdb_reply_control *reply;
640         enum ctdb_controls control = CTDB_CONTROL_DB_ATTACH;
641
642         if (db->persistent) {
643                 control = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
644         }
645
646         reply = unpack_reply_control(req, control);
647         if (!reply || reply->status != 0) {
648                 if (reply) {
649                         DEBUG(ctdb, LOG_ERR,
650                               "ctdb_attachdb_send(async): DB_ATTACH status %i",
651                               reply->status);
652                 }
653                 /* We failed.  Hand request to user and have them discover it
654                  * via ctdb_attachdb_recv. */
655                 db->callback(ctdb, req, db->private_data);
656                 return;
657         }
658         db->id = *(uint32_t *)reply->data;
659
660         /* Now we do another call, to get the dbpath. */
661         req2 = new_ctdb_control_request(db->ctdb, CTDB_CONTROL_GETDBPATH,
662                                         CTDB_CURRENT_NODE,
663                                         &db->id, sizeof(db->id),
664                                         attachdb_getdbpath_done, db);
665         if (!req2) {
666                 DEBUG(db->ctdb, LOG_ERR,
667                       "ctdb_attachdb_send(async): failed to allocate");
668                 db->callback(ctdb, req, db->private_data);
669                 return;
670         }
671         req->extra = req2;
672         req2->extra = req;
673         DEBUG(db->ctdb, LOG_DEBUG,
674               "ctdb_attachdb_send(async): created getdbpath request");
675 }
676
677 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
678                                     struct ctdb_request *req,
679                                     void *_db)
680 {
681         struct ctdb_db *db = _db;
682
683         /* Do callback on original request. */
684         db->callback(ctdb, req->extra, db->private_data);
685 }
686
687 struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
688                                    struct ctdb_request *req)
689 {
690         struct ctdb_request *dbpath_req = req->extra;
691         struct ctdb_reply_control *reply;
692         struct ctdb_db *db = req->priv_data;
693         uint32_t tdb_flags = db->tdb_flags;
694         struct tdb_logging_context log;
695
696         /* Never sent the dbpath request?  We've failed. */
697         if (!dbpath_req) {
698                 /* FIXME: Save errno? */
699                 errno = EINVAL;
700                 return NULL;
701         }
702
703         reply = unpack_reply_control(dbpath_req, CTDB_CONTROL_GETDBPATH);
704         if (!reply) {
705                 return NULL;
706         }
707         if (reply->status != 0) {
708                 DEBUG(db->ctdb, LOG_ERR,
709                       "ctdb_attachdb_recv: reply status %i", reply->status);
710                 return NULL;
711         }
712
713         tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC;
714         tdb_flags |= TDB_DISALLOW_NESTING;
715
716         log.log_fn = ctdb_tdb_log_bridge;
717         log.log_private = ctdb;
718         db->tdb = tdb_open_ex((char *)reply->data, 0, tdb_flags, O_RDWR, 0,
719                               &log, NULL);
720         if (db->tdb == NULL) {
721                 DEBUG(db->ctdb, LOG_ERR,
722                       "ctdb_attachdb_recv: failed to tdb_open %s",
723                       (char *)reply->data);
724                 return NULL;
725         }
726
727         /* Finally, separate the db from the request (see destroy_req_db). */
728         req->priv_data = NULL;
729         DEBUG(db->ctdb, LOG_DEBUG,
730               "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
731         return db;
732 }
733
734 static unsigned long lock_magic(struct ctdb_lock *lock)
735 {
736         /* A non-zero magic specific to this structure. */
737         return ((unsigned long)lock->key.dptr
738                 ^ (((unsigned long)lock->key.dptr) << 16)
739                 ^ 0xBADC0FFEEBADC0DEULL)
740                 | 1;
741 }
742
743 /* This is only called on locks before they're held. */
744 static void free_lock(struct ctdb_lock *lock)
745 {
746         if (lock->held_magic) {
747                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
748                       "free_lock invalid lock %p", lock);
749         }
750         free(lock->hdr);
751         free(lock);
752 }
753
754
755 void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock)
756 {
757         if (lock->held_magic != lock_magic(lock)) {
758                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
759                       "ctdb_release_lock invalid lock %p", lock);
760         } else if (lock->ctdb_db != ctdb_db) {
761                 errno = EBADF;
762                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
763                       "ctdb_release_lock: wrong ctdb_db.");
764         } else {
765                 tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
766                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
767                       "ctdb_release_lock %p", lock);
768                 remove_lock(lock->ctdb_db->ctdb, lock);
769         }
770         lock->held_magic = 0;
771         free_lock(lock);
772 }
773
774
775 /* We keep the lock if local node is the dmaster. */
776 static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
777 {
778         struct ctdb_ltdb_header *hdr;
779
780         if (tdb_chainlock(lock->ctdb_db->tdb, lock->key) != 0) {
781                 DEBUG(lock->ctdb_db->ctdb, LOG_WARNING,
782                       "ctdb_readrecordlock_async: failed to chainlock");
783                 return NULL;
784         }
785
786         hdr = ctdb_local_fetch(lock->ctdb_db->tdb, lock->key, data);
787         if (hdr && hdr->dmaster == lock->ctdb_db->ctdb->pnn) {
788                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
789                       "ctdb_readrecordlock_async: got local lock");
790                 lock->held_magic = lock_magic(lock);
791                 lock->hdr = hdr;
792                 add_lock(lock->ctdb_db->ctdb, lock);
793                 return true;
794         }
795
796         tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
797         free(hdr);
798         return NULL;
799 }
800
801 /* If they shutdown before we hand them the lock, we free it here. */
802 static void destroy_lock(struct ctdb_connection *ctdb,
803                          struct ctdb_request *req)
804 {
805         free_lock(req->extra);
806 }
807
808 static void readrecordlock_retry(struct ctdb_connection *ctdb,
809                                  struct ctdb_request *req, void *private)
810 {
811         struct ctdb_lock *lock = req->extra;
812         struct ctdb_reply_call *reply;
813         TDB_DATA data;
814
815         /* OK, we've received reply to noop migration */
816         reply = unpack_reply_call(req, CTDB_NULL_FUNC);
817         if (!reply || reply->status != 0) {
818                 if (reply) {
819                         DEBUG(ctdb, LOG_ERR,
820                               "ctdb_readrecordlock_async(async):"
821                               " NULL_FUNC returned %i", reply->status);
822                 }
823                 lock->callback(lock->ctdb_db, NULL, tdb_null, private);
824                 ctdb_request_free(req); /* Also frees lock. */
825                 return;
826         }
827
828         /* Can we get lock now? */
829         if (try_readrecordlock(lock, &data)) {
830                 /* Now it's their responsibility to free lock & request! */
831                 req->extra_destructor = NULL;
832                 lock->callback(lock->ctdb_db, lock, data, private);
833                 ctdb_request_free(req);
834                 return;
835         }
836
837         /* Retransmit the same request again (we lost race). */
838         io_elem_reset(req->io);
839         DLIST_ADD(ctdb->outq, req);
840 }
841
842 bool
843 ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
844                           ctdb_rrl_callback_t callback, void *cbdata)
845 {
846         struct ctdb_request *req;
847         struct ctdb_lock *lock;
848         TDB_DATA data;
849
850         if (holding_lock(ctdb_db->ctdb)) {
851                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
852                       "ctdb_readrecordlock_async: already holding lock");
853                 return false;
854         }
855
856         /* Setup lock */
857         lock = malloc(sizeof(*lock) + key.dsize);
858         if (!lock) {
859                 DEBUG(ctdb_db->ctdb, LOG_ERR,
860                       "ctdb_readrecordlock_async: lock allocation failed");
861                 return false;
862         }
863         lock->key.dptr = (void *)(lock + 1);
864         memcpy(lock->key.dptr, key.dptr, key.dsize);
865         lock->key.dsize = key.dsize;
866         lock->ctdb_db = ctdb_db;
867         lock->hdr = NULL;
868         lock->held_magic = 0;
869
870         /* Fast path. */
871         if (try_readrecordlock(lock, &data)) {
872                 callback(ctdb_db, lock, data, cbdata);
873                 return true;
874         }
875
876         /* Slow path: create request. */
877         req = new_ctdb_request(
878                 ctdb_db->ctdb,
879                 offsetof(struct ctdb_req_call, data) + key.dsize,
880                 readrecordlock_retry, cbdata);
881         if (!req) {
882                 DEBUG(ctdb_db->ctdb, LOG_ERR,
883                       "ctdb_readrecordlock_async: allocation failed");
884                 free_lock(lock);
885                 return NULL;
886         }
887         req->extra = lock;
888         req->extra_destructor = destroy_lock;
889         /* We store the original callback in the lock, and use our own. */
890         lock->callback = callback;
891
892         io_elem_init_req_header(req->io, CTDB_REQ_CALL, CTDB_CURRENT_NODE,
893                                 new_reqid(ctdb_db->ctdb));
894
895         req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
896         req->hdr.call->db_id = ctdb_db->id;
897         req->hdr.call->callid = CTDB_NULL_FUNC;
898         req->hdr.call->hopcount = 0;
899         req->hdr.call->keylen = key.dsize;
900         req->hdr.call->calldatalen = 0;
901         memcpy(req->hdr.call->data, key.dptr, key.dsize);
902         DLIST_ADD(ctdb_db->ctdb->outq, req);
903         return true;
904 }
905
906 bool ctdb_writerecord(struct ctdb_db *ctdb_db,
907                       struct ctdb_lock *lock, TDB_DATA data)
908 {
909         if (lock->ctdb_db != ctdb_db) {
910                 errno = EBADF;
911                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
912                       "ctdb_writerecord: Can not write, wrong ctdb_db.");
913                 return false;
914         }
915
916         if (lock->held_magic != lock_magic(lock)) {
917                 errno = EBADF;
918                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
919                       "ctdb_writerecord: Can not write. Lock has been released.");
920                 return false;
921         }
922                 
923         if (ctdb_db->persistent) {
924                 errno = EINVAL;
925                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
926                       "ctdb_writerecord: cannot write to persistent db");
927                 return false;
928         }
929
930         switch (ctdb_local_store(ctdb_db->tdb, lock->key, lock->hdr, data)) {
931         case 0:
932                 DEBUG(ctdb_db->ctdb, LOG_DEBUG,
933                       "ctdb_writerecord: optimized away noop write.");
934                 /* fall thru */
935         case 1:
936                 return true;
937
938         default:
939                 switch (errno) {
940                 case ENOMEM:
941                         DEBUG(ctdb_db->ctdb, LOG_CRIT,
942                               "ctdb_writerecord: out of memory.");
943                         break;
944                 case EINVAL:
945                         DEBUG(ctdb_db->ctdb, LOG_ALERT,
946                               "ctdb_writerecord: record changed under lock?");
947                         break;
948                 default: /* TDB already logged. */
949                         break;
950                 }
951                 return false;
952         }
953 }
954
955
956 struct ctdb_traverse_state {
957         struct ctdb_request *handle;
958         struct ctdb_db *ctdb_db;
959         uint64_t srvid;
960
961         ctdb_traverse_callback_t callback;
962         void *cbdata;
963 };
964
965 static void traverse_remhnd_cb(struct ctdb_connection *ctdb,
966                         struct ctdb_request *req, void *private_data)
967 {
968         struct ctdb_traverse_state *state = private_data;
969
970         if (!ctdb_remove_message_handler_recv(ctdb, state->handle)) {
971                 DEBUG(ctdb, LOG_ERR,
972                                 "Failed to remove message handler for"
973                                 " traverse.");
974                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
975                                 TRAVERSE_STATUS_ERROR,
976                                 tdb_null, tdb_null,
977                                 state->cbdata);
978         }
979         ctdb_request_free(state->handle);
980         state->handle = NULL;
981         free(state);
982 }
983         
984 static void msg_h(struct ctdb_connection *ctdb, uint64_t srvid,
985            TDB_DATA data, void *private_data)
986 {
987         struct ctdb_traverse_state *state = private_data;
988         struct ctdb_db *ctdb_db = state->ctdb_db;
989         struct ctdb_rec_data *d = (struct ctdb_rec_data *)data.dptr;
990         TDB_DATA key;
991
992         if (data.dsize < sizeof(uint32_t) ||
993             d->length != data.dsize) {
994                 DEBUG(ctdb, LOG_ERR,
995                         "Bad data size %u in traverse_handler",
996                         (unsigned)data.dsize);
997                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
998                                 TRAVERSE_STATUS_ERROR,
999                                 tdb_null, tdb_null,
1000                                 state->cbdata);
1001                 state->handle = ctdb_remove_message_handler_send(
1002                                 state->ctdb_db->ctdb, state->srvid,
1003                                 msg_h, state,
1004                                 traverse_remhnd_cb, state);
1005                 return;
1006         }
1007
1008         key.dsize = d->keylen;
1009         key.dptr  = &d->data[0];
1010         data.dsize = d->datalen;
1011         data.dptr = &d->data[d->keylen];
1012
1013         if (key.dsize == 0 && data.dsize == 0) {
1014                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1015                                 TRAVERSE_STATUS_FINISHED,
1016                                 tdb_null, tdb_null,
1017                                 state->cbdata);
1018                 state->handle = ctdb_remove_message_handler_send(
1019                                 state->ctdb_db->ctdb, state->srvid,
1020                                 msg_h, state,
1021                                 traverse_remhnd_cb, state);
1022                 return;
1023         }
1024
1025         if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1026                 /* empty records are deleted records in ctdb */
1027                 return;
1028         }
1029
1030         data.dsize -= sizeof(struct ctdb_ltdb_header);
1031         data.dptr  += sizeof(struct ctdb_ltdb_header);
1032
1033         if (state->callback(ctdb, ctdb_db,
1034                         TRAVERSE_STATUS_RECORD,
1035                         key, data, state->cbdata) != 0) {
1036                 state->handle = ctdb_remove_message_handler_send(
1037                                 state->ctdb_db->ctdb, state->srvid,
1038                                 msg_h, state,
1039                                 traverse_remhnd_cb, state);
1040                 return;
1041         }
1042 }
1043
1044 static void traverse_start_cb(struct ctdb_connection *ctdb,
1045                         struct ctdb_request *req, void *private_data)
1046 {
1047         struct ctdb_traverse_state *state = private_data;
1048
1049         ctdb_request_free(state->handle);
1050         state->handle = NULL;
1051 }
1052
1053 static void traverse_msghnd_cb(struct ctdb_connection *ctdb,
1054                         struct ctdb_request *req, void *private_data)
1055 {
1056         struct ctdb_traverse_state *state = private_data;
1057         struct ctdb_db *ctdb_db = state->ctdb_db;
1058         struct ctdb_traverse_start t;
1059
1060         if (!ctdb_set_message_handler_recv(ctdb, state->handle)) {
1061                 DEBUG(ctdb, LOG_ERR,
1062                                 "Failed to register message handler for"
1063                                 " traverse.");
1064                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1065                                 TRAVERSE_STATUS_ERROR,
1066                                 tdb_null, tdb_null,
1067                                 state->cbdata);
1068                 ctdb_request_free(state->handle);
1069                 state->handle = NULL;
1070                 free(state);
1071                 return;
1072         }
1073         ctdb_request_free(state->handle);
1074         state->handle = NULL;
1075
1076         t.db_id = ctdb_db->id;
1077         t.srvid = state->srvid;
1078         t.reqid = 0;
1079
1080         state->handle = new_ctdb_control_request(ctdb,
1081                                 CTDB_CONTROL_TRAVERSE_START,
1082                                 CTDB_CURRENT_NODE,
1083                                 &t, sizeof(t),
1084                                 traverse_start_cb, state);
1085         if (state->handle == NULL) {
1086                 DEBUG(ctdb, LOG_ERR,
1087                                 "ctdb_traverse_async:"
1088                                 " failed to send traverse_start control");
1089                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1090                                 TRAVERSE_STATUS_ERROR,
1091                                 tdb_null, tdb_null,
1092                                 state->cbdata);
1093                 state->handle = ctdb_remove_message_handler_send(
1094                                 state->ctdb_db->ctdb, state->srvid,
1095                                 msg_h, state,
1096                                 traverse_remhnd_cb, state);
1097                 return;
1098         }
1099 }
1100
1101 bool ctdb_traverse_async(struct ctdb_db *ctdb_db,
1102                          ctdb_traverse_callback_t callback, void *cbdata)
1103 {
1104         struct ctdb_connection *ctdb = ctdb_db->ctdb;
1105         struct ctdb_traverse_state *state;
1106         static uint32_t tid = 0;
1107
1108         state = malloc(sizeof(struct ctdb_traverse_state));
1109         if (state == NULL) {
1110                 DEBUG(ctdb, LOG_ERR,
1111                                 "ctdb_traverse_async: no memory."
1112                                 " allocate state failed");
1113                 return false;
1114         }
1115
1116         tid++;
1117         state->srvid = CTDB_SRVID_TRAVERSE_RANGE|tid;
1118
1119         state->callback = callback;
1120         state->cbdata   = cbdata;
1121         state->ctdb_db  = ctdb_db;
1122
1123         state->handle = ctdb_set_message_handler_send(ctdb_db->ctdb,
1124                                 state->srvid,
1125                                 msg_h, state,
1126                                 traverse_msghnd_cb, state);
1127         if (state->handle == NULL) {
1128                 DEBUG(ctdb, LOG_ERR,
1129                         "ctdb_traverse_async:"
1130                         " failed ctdb_set_message_handler_send");
1131                 free(state);
1132                 return false;
1133         }
1134
1135         return true;
1136 }
1137
1138 int ctdb_num_out_queue(struct ctdb_connection *ctdb)
1139 {
1140         struct ctdb_request *req;
1141         int i;
1142
1143         for (i = 0, req = ctdb->outq; req; req = req->next, i++)
1144                 ;
1145
1146         return i;
1147 }
1148
1149 int ctdb_num_in_flight(struct ctdb_connection *ctdb)
1150 {
1151         struct ctdb_request *req;
1152         int i;
1153
1154         for (i = 0, req = ctdb->doneq; req; req = req->next, i++)
1155                 ;
1156
1157         return i;
1158 }
1159
/*
 * Total number of requests on the connection: those on the out queue
 * plus those counted by ctdb_num_in_flight().
 */
int ctdb_num_active(struct ctdb_connection *ctdb)
{
	int queued = ctdb_num_out_queue(ctdb);
	int in_flight = ctdb_num_in_flight(ctdb);

	return queued + in_flight;
}
1165