264952485797a065fb3d689ef20dc6ed2face204
[sahlberg/ctdb.git] / libctdb / ctdb.c
1 /*
2    core of libctdb
3
4    Copyright (C) Rusty Russell 2010
5
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, see <http://www.gnu.org/licenses/>.
18 */
19 #include <ctdb.h>
20 #include <poll.h>
21 #include <errno.h>
22 #include <unistd.h>
23 #include <fcntl.h>
24 #include <stdlib.h>
25 #include <sys/socket.h>
26 #include <sys/un.h>
27 #include <sys/ioctl.h>
28 #include "libctdb_private.h"
29 #include "io_elem.h"
30 #include "local_tdb.h"
31 #include "messages.h"
32 #include <dlinklist.h>
33 #include <ctdb_protocol.h>
34
35 /* Remove type-safety macros. */
36 #undef ctdb_attachdb_send
37 #undef ctdb_readrecordlock_async
38 #undef ctdb_connect
39
40 struct ctdb_lock {
41         struct ctdb_lock *next, *prev;
42
43         struct ctdb_db *ctdb_db;
44         TDB_DATA key;
45
46         /* This will always be set by the time user sees this. */
47         unsigned long held_magic;
48         struct ctdb_ltdb_header *hdr;
49
50         /* For convenience, we stash original callback here. */
51         ctdb_rrl_callback_t callback;
52 };
53
54 struct ctdb_db {
55         struct ctdb_connection *ctdb;
56         bool persistent;
57         uint32_t tdb_flags;
58         uint32_t id;
59         struct tdb_context *tdb;
60
61         ctdb_callback_t callback;
62         void *private_data;
63 };
64
65 static void remove_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
66 {
67         DLIST_REMOVE(ctdb->locks, lock);
68 }
69
70 /* FIXME: for thread safety, need tid info too. */
71 static bool holding_lock(struct ctdb_connection *ctdb)
72 {
73         /* For the moment, you can't ever hold more than 1 lock. */
74         return (ctdb->locks != NULL);
75 }
76
77 static void add_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
78 {
79         DLIST_ADD(ctdb->locks, lock);
80 }
81
82 static void cleanup_locks(struct ctdb_connection *ctdb, struct ctdb_db *db)
83 {
84         struct ctdb_lock *i, *next;
85
86         for (i = ctdb->locks; i; i = next) {
87                 /* Grab next pointer, as release_lock will free i */
88                 next = i->next;
89                 if (i->ctdb_db == db) {
90                         ctdb_release_lock(db, i);
91                 }
92         }
93 }
94
/*
 * Close a descriptor without disturbing errno, for use on error paths.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void close_noerr(int fd)
{
        const int saved_errno = errno;

        close(fd);
        errno = saved_errno;
}
102
/*
 * Free memory without disturbing errno, for use on error paths.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void free_noerr(void *p)
{
        const int saved_errno = errno;

        free(p);
        errno = saved_errno;
}
110
/*
 * Switch a descriptor to non-blocking mode (best effort).
 *
 * If the current flags cannot be read the descriptor is left alone,
 * rather than feeding fcntl's -1 error value back in as a flag mask.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void set_nonblocking(int fd)
{
        int flags = fcntl(fd, F_GETFL, 0);

        if (flags == -1) {
                return;
        }
        fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
118
/*
 * Mark a descriptor close-on-exec (best effort).
 *
 * If the current descriptor flags cannot be read the descriptor is left
 * alone, rather than feeding fcntl's -1 error value back in as flags.
 *
 * FIXME: Could be in shared util code with rest of ctdb
 */
static void set_close_on_exec(int fd)
{
        int flags = fcntl(fd, F_GETFD, 0);

        if (flags == -1) {
                return;
        }
        fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
}
126
127 static void set_pnn(struct ctdb_connection *ctdb,
128                     struct ctdb_request *req,
129                     void *unused)
130 {
131         if (!ctdb_getpnn_recv(ctdb, req, &ctdb->pnn)) {
132                 DEBUG(ctdb, LOG_CRIT,
133                       "ctdb_connect(async): failed to get pnn");
134                 ctdb->broken = true;
135         }
136         ctdb_request_free(ctdb, req);
137 }
138
139 struct ctdb_connection *ctdb_connect(const char *addr,
140                                      ctdb_log_fn_t log_fn, void *log_priv)
141 {
142         struct ctdb_connection *ctdb;
143         struct sockaddr_un sun;
144
145         ctdb = malloc(sizeof(*ctdb));
146         if (!ctdb) {
147                 /* With no format string, we hope it doesn't use ap! */
148                 va_list ap;
149                 memset(&ap, 0, sizeof(ap));
150                 errno = ENOMEM;
151                 log_fn(log_priv, LOG_ERR, "ctdb_connect: no memory", ap);
152                 goto fail;
153         }
154         ctdb->outq = NULL;
155         ctdb->doneq = NULL;
156         ctdb->in = NULL;
157         ctdb->message_handlers = NULL;
158         ctdb->next_id = 0;
159         ctdb->broken = false;
160         ctdb->log = log_fn;
161         ctdb->log_priv = log_priv;
162         ctdb->locks = NULL;
163
164         memset(&sun, 0, sizeof(sun));
165         sun.sun_family = AF_UNIX;
166         if (!addr)
167                 addr = CTDB_PATH;
168         strncpy(sun.sun_path, addr, sizeof(sun.sun_path));
169         ctdb->fd = socket(AF_UNIX, SOCK_STREAM, 0);
170         if (ctdb->fd < 0)
171                 goto free_fail;
172
173         set_nonblocking(ctdb->fd);
174         set_close_on_exec(ctdb->fd);
175
176         if (connect(ctdb->fd, (struct sockaddr *)&sun, sizeof(sun)) == -1)
177                 goto close_fail;
178
179         /* Immediately queue a request to get our pnn. */
180         if (!ctdb_getpnn_send(ctdb, CTDB_CURRENT_NODE, set_pnn, NULL))
181                 goto close_fail;
182
183         return ctdb;
184
185 close_fail:
186         close_noerr(ctdb->fd);
187 free_fail:
188         free_noerr(ctdb);
189 fail:
190         return NULL;
191 }
192
193 void ctdb_disconnect(struct ctdb_connection *ctdb)
194 {
195         struct ctdb_request *i;
196
197         DEBUG(ctdb, LOG_DEBUG, "ctdb_disconnect");
198
199         while ((i = ctdb->outq) != NULL) {
200                 DLIST_REMOVE(ctdb->outq, i);
201                 ctdb_request_free(ctdb, i);
202         }
203
204         while ((i = ctdb->doneq) != NULL) {
205                 DLIST_REMOVE(ctdb->doneq, i);
206                 ctdb_request_free(ctdb, i);
207         }
208
209         if (ctdb->in)
210                 free_io_elem(ctdb->in);
211
212         remove_message_handlers(ctdb);
213
214         close(ctdb->fd);
215         /* Just in case they try to reuse */
216         ctdb->fd = -1;
217         free(ctdb);
218 }
219
220 int ctdb_get_fd(struct ctdb_connection *ctdb)
221 {
222         return ctdb->fd;
223 }
224
225 int ctdb_which_events(struct ctdb_connection *ctdb)
226 {
227         int events = POLLIN;
228
229         if (ctdb->outq)
230                 events |= POLLOUT;
231         return events;
232 }
233
234 struct ctdb_request *new_ctdb_request(size_t len,
235                                       ctdb_callback_t cb, void *cbdata)
236 {
237         struct ctdb_request *req = malloc(sizeof(*req));
238         if (!req)
239                 return NULL;
240         req->io = new_io_elem(len);
241         if (!req->io) {
242                 free(req);
243                 return NULL;
244         }
245         req->hdr.hdr = io_elem_data(req->io, NULL);
246         req->reply = NULL;
247         req->callback = cb;
248         req->priv_data = cbdata;
249         req->extra = NULL;
250         req->extra_destructor = NULL;
251         return req;
252 }
253
254 void ctdb_request_free(struct ctdb_connection *ctdb, struct ctdb_request *req)
255 {
256         if (req->next || req->prev) {
257                 DEBUG(ctdb, LOG_ALERT,
258                       "ctdb_request_free: request not complete! ctdb_cancel? %p (id %u)",
259                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
260                 ctdb_cancel(ctdb, req);
261                 return;
262         }
263         if (req->extra_destructor) {
264                 req->extra_destructor(ctdb, req);
265         }
266         if (req->reply) {
267                 free_io_elem(req->reply);
268         }
269         free_io_elem(req->io);
270         free(req);
271 }
272
273 /* Sanity-checking wrapper for reply. */
274 static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
275                                                  struct ctdb_request *req,
276                                                  uint32_t callid)
277 {
278         size_t len;
279         struct ctdb_reply_call *inhdr = io_elem_data(req->reply, &len);
280
281         /* Library user error if this isn't a reply to a call. */
282         if (req->hdr.hdr->operation != CTDB_REQ_CALL) {
283                 errno = EINVAL;
284                 DEBUG(ctdb, LOG_ALERT,
285                       "This was not a ctdbd call request: operation %u",
286                       req->hdr.hdr->operation);
287                 return NULL;
288         }
289
290         if (req->hdr.call->callid != callid) {
291                 errno = EINVAL;
292                 DEBUG(ctdb, LOG_ALERT,
293                       "This was not a ctdbd %u call request: %u",
294                       callid, req->hdr.call->callid);
295                 return NULL;
296         }
297
298         /* ctdbd or our error if this isn't a reply call. */
299         if (len < sizeof(*inhdr) || inhdr->hdr.operation != CTDB_REPLY_CALL) {
300                 errno = EIO;
301                 DEBUG(ctdb, LOG_CRIT,
302                       "Invalid ctdbd call reply: len %zu, operation %u",
303                       len, inhdr->hdr.operation);
304                 return NULL;
305         }
306
307         return inhdr;
308 }
309
310 /* Sanity-checking wrapper for reply. */
311 struct ctdb_reply_control *unpack_reply_control(struct ctdb_connection *ctdb,
312                                                 struct ctdb_request *req,
313                                                 enum ctdb_controls control)
314 {
315         size_t len;
316         struct ctdb_reply_control *inhdr = io_elem_data(req->reply, &len);
317
318         /* Library user error if this isn't a reply to a call. */
319         if (len < sizeof(*inhdr)) {
320                 errno = EINVAL;
321                 DEBUG(ctdb, LOG_ALERT,
322                       "Short ctdbd control reply: %zu bytes", len);
323                 return NULL;
324         }
325         if (req->hdr.hdr->operation != CTDB_REQ_CONTROL) {
326                 errno = EINVAL;
327                 DEBUG(ctdb, LOG_ALERT,
328                       "This was not a ctdbd control request: operation %u",
329                       req->hdr.hdr->operation);
330                 return NULL;
331         }
332
333         /* ... or if it was a different control from what we expected. */
334         if (req->hdr.control->opcode != control) {
335                 errno = EINVAL;
336                 DEBUG(ctdb, LOG_ALERT,
337                       "This was not an opcode %u ctdbd control request: %u",
338                       control, req->hdr.control->opcode);
339                 return NULL;
340         }
341
342         /* ctdbd or our error if this isn't a reply call. */
343         if (inhdr->hdr.operation != CTDB_REPLY_CONTROL) {
344                 errno = EIO;
345                 DEBUG(ctdb, LOG_CRIT,
346                       "Invalid ctdbd control reply: operation %u",
347                       inhdr->hdr.operation);
348                 return NULL;
349         }
350
351         return inhdr;
352 }
353
354 static void handle_incoming(struct ctdb_connection *ctdb, struct io_elem *in)
355 {
356         struct ctdb_req_header *hdr;
357         size_t len;
358         struct ctdb_request *i;
359
360         hdr = io_elem_data(in, &len);
361         /* FIXME: use len to check packet! */
362
363         if (hdr->operation == CTDB_REQ_MESSAGE) {
364                 deliver_message(ctdb, hdr);
365                 return;
366         }
367
368         for (i = ctdb->doneq; i; i = i->next) {
369                 if (i->hdr.hdr->reqid == hdr->reqid) {
370                         DLIST_REMOVE(ctdb->doneq, i);
371                         i->reply = in;
372                         i->callback(ctdb, i, i->priv_data);
373                         return;
374                 }
375         }
376         DEBUG(ctdb, LOG_WARNING,
377               "Unexpected ctdbd request reply: operation %u reqid %u",
378               hdr->operation, hdr->reqid);
379         free_io_elem(in);
380 }
381
/*
 * Remove "harmless" errors.
 *
 * Maps a negative I/O result to 0 when errno says the call was merely
 * interrupted or would have blocked; any other value is passed through.
 * EAGAIN is checked as well as EWOULDBLOCK because POSIX permits the two
 * to have different values.
 */
static ssize_t real_error(ssize_t ret)
{
        if (ret < 0 && (errno == EINTR
                        || errno == EAGAIN
                        || errno == EWOULDBLOCK))
                return 0;
        return ret;
}
389
390 bool ctdb_service(struct ctdb_connection *ctdb, int revents)
391 {
392         if (ctdb->broken) {
393                 return false;
394         }
395
396         if (holding_lock(ctdb)) {
397                 DEBUG(ctdb, LOG_ALERT, "Do not block while holding lock!");
398         }
399
400         if (revents & POLLOUT) {
401                 while (ctdb->outq) {
402                         if (real_error(write_io_elem(ctdb->fd,
403                                                      ctdb->outq->io)) < 0) {
404                                 DEBUG(ctdb, LOG_ERR,
405                                       "ctdb_service: error writing to ctdbd");
406                                 ctdb->broken = true;
407                                 return false;
408                         }
409                         if (io_elem_finished(ctdb->outq->io)) {
410                                 struct ctdb_request *done = ctdb->outq;
411                                 DLIST_REMOVE(ctdb->outq, done);
412                                 /* We add at the head: any dead ones
413                                  * sit and end. */
414                                 DLIST_ADD(ctdb->doneq, done);
415                         }
416                 }
417         }
418
419         while (revents & POLLIN) {
420                 int ret;
421                 int num_ready = 0;
422
423                 if (ioctl(ctdb->fd, FIONREAD, &num_ready) != 0) {
424                         DEBUG(ctdb, LOG_ERR,
425                               "ctdb_service: ioctl(FIONREAD) %d", errno);
426                         ctdb->broken = true;
427                         return false;
428                 }
429                 if (num_ready == 0) {
430                         /* the descriptor has been closed or we have all our data */
431                         break;
432                 }
433
434
435                 if (!ctdb->in) {
436                         ctdb->in = new_io_elem(sizeof(struct ctdb_req_header));
437                         if (!ctdb->in) {
438                                 DEBUG(ctdb, LOG_ERR,
439                                       "ctdb_service: allocating readbuf");
440                                 ctdb->broken = true;
441                                 return false;
442                         }
443                 }
444
445                 ret = read_io_elem(ctdb->fd, ctdb->in);
446                 if (real_error(ret) < 0 || ret == 0) {
447                         /* They closed fd? */
448                         if (ret == 0)
449                                 errno = EBADF;
450                         DEBUG(ctdb, LOG_ERR,
451                               "ctdb_service: error reading from ctdbd");
452                         ctdb->broken = true;
453                         return false;
454                 } else if (ret < 0) {
455                         /* No progress, stop loop. */
456                         break;
457                 } else if (io_elem_finished(ctdb->in)) {
458                         io_elem_queue(ctdb, ctdb->in);
459                         ctdb->in = NULL;
460                 }
461         }
462
463
464         while (ctdb->inqueue != NULL) {
465                 struct io_elem *io = ctdb->inqueue;
466
467                 io_elem_dequeue(ctdb, io);
468                 handle_incoming(ctdb, io);
469         }
470
471         return true;
472 }
473
474 /* This is inefficient.  We could pull in idtree.c. */
475 static bool reqid_used(const struct ctdb_connection *ctdb, uint32_t reqid)
476 {
477         struct ctdb_request *i;
478
479         for (i = ctdb->outq; i; i = i->next) {
480                 if (i->hdr.hdr->reqid == reqid) {
481                         return true;
482                 }
483         }
484         for (i = ctdb->doneq; i; i = i->next) {
485                 if (i->hdr.hdr->reqid == reqid) {
486                         return true;
487                 }
488         }
489         return false;
490 }
491
492 uint32_t new_reqid(struct ctdb_connection *ctdb)
493 {
494         while (reqid_used(ctdb, ctdb->next_id)) {
495                 ctdb->next_id++;
496         }
497         return ctdb->next_id++;
498 }
499
500 struct ctdb_request *new_ctdb_control_request(struct ctdb_connection *ctdb,
501                                               uint32_t opcode,
502                                               uint32_t destnode,
503                                               const void *extra_data,
504                                               size_t extra,
505                                               ctdb_callback_t callback,
506                                               void *cbdata)
507 {
508         struct ctdb_request *req;
509         struct ctdb_req_control *pkt;
510
511         req = new_ctdb_request(offsetof(struct ctdb_req_control, data) + extra, callback, cbdata);
512         if (!req)
513                 return NULL;
514
515         io_elem_init_req_header(req->io,
516                                 CTDB_REQ_CONTROL, destnode, new_reqid(ctdb));
517
518         pkt = req->hdr.control;
519         pkt->pad = 0;
520         pkt->opcode = opcode;
521         pkt->srvid = 0;
522         pkt->client_id = 0;
523         pkt->flags = 0;
524         pkt->datalen = extra;
525         memcpy(pkt->data, extra_data, extra);
526         DLIST_ADD(ctdb->outq, req);
527         return req;
528 }
529
/*
 * Callback installed by ctdb_cancel(): once the cancelled request
 * completes, all that remains is to free it.
 */
void ctdb_cancel_callback(struct ctdb_connection *ctdb,
                          struct ctdb_request *req,
                          void *unused)
{
        struct ctdb_request *finished = req;

        ctdb_request_free(ctdb, finished);
}
536
537 void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
538 {
539         if (!req->next && !req->prev) {
540                 DEBUG(ctdb, LOG_ALERT,
541                       "ctdb_cancel: request completed! ctdb_request_free? %p (id %u)",
542                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
543                 ctdb_request_free(ctdb, req);
544                 return;
545         }
546
547         DEBUG(ctdb, LOG_DEBUG, "ctdb_cancel: %p (id %u)",
548               req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
549
550         /* FIXME: If it's not sent, we could just free it right now. */
551         req->callback = ctdb_cancel_callback;
552 }
553
554 void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db)
555 {
556         cleanup_locks(ctdb, db);
557         tdb_close(db->tdb);
558         free(db);
559 }
560
561 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
562                                     struct ctdb_request *req,
563                                     void *_db)
564 {
565         struct ctdb_db *db = _db;
566
567         /* Do callback on original request. */
568         db->callback(ctdb, req->extra, db->private_data);
569 }
570
571 struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
572                                    struct ctdb_request *req)
573 {
574         struct ctdb_request *dbpath_req = req->extra;
575         struct ctdb_reply_control *reply;
576         struct ctdb_db *db = req->priv_data;
577         uint32_t tdb_flags = db->tdb_flags;
578         struct tdb_logging_context log;
579
580         /* Never sent the dbpath request?  We've failed. */
581         if (!dbpath_req) {
582                 /* FIXME: Save errno? */
583                 errno = EINVAL;
584                 return NULL;
585         }
586
587         reply = unpack_reply_control(ctdb, dbpath_req, CTDB_CONTROL_GETDBPATH);
588         if (!reply) {
589                 return NULL;
590         }
591         if (reply->status != 0) {
592                 DEBUG(db->ctdb, LOG_ERR,
593                       "ctdb_attachdb_recv: reply status %i", reply->status);
594                 return NULL;
595         }
596
597         tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC;
598         tdb_flags |= TDB_DISALLOW_NESTING;
599
600         log.log_fn = ctdb_tdb_log_bridge;
601         log.log_private = ctdb;
602         db->tdb = tdb_open_ex((char *)reply->data, 0, tdb_flags, O_RDWR, 0,
603                               &log, NULL);
604         if (db->tdb == NULL) {
605                 DEBUG(db->ctdb, LOG_ERR,
606                       "ctdb_attachdb_recv: failed to tdb_open %s",
607                       (char *)reply->data);
608                 return NULL;
609         }
610
611         /* Finally, separate the db from the request (see destroy_req_db). */
612         req->priv_data = NULL;
613         DEBUG(db->ctdb, LOG_DEBUG,
614               "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
615         return db;
616 }
617
618 static void attachdb_done(struct ctdb_connection *ctdb,
619                           struct ctdb_request *req,
620                           void *_db)
621 {
622         struct ctdb_db *db = _db;
623         struct ctdb_request *req2;
624         struct ctdb_reply_control *reply;
625         enum ctdb_controls control = CTDB_CONTROL_DB_ATTACH;
626
627         if (db->persistent) {
628                 control = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
629         }
630
631         reply = unpack_reply_control(ctdb, req, control);
632         if (!reply || reply->status != 0) {
633                 if (reply) {
634                         DEBUG(ctdb, LOG_ERR,
635                               "ctdb_attachdb_send(async): DB_ATTACH status %i",
636                               reply->status);
637                 }
638                 /* We failed.  Hand request to user and have them discover it
639                  * via ctdb_attachdb_recv. */
640                 db->callback(ctdb, req, db->private_data);
641                 return;
642         }
643         db->id = *(uint32_t *)reply->data;
644
645         /* Now we do another call, to get the dbpath. */
646         req2 = new_ctdb_control_request(db->ctdb, CTDB_CONTROL_GETDBPATH,
647                                         CTDB_CURRENT_NODE,
648                                         &db->id, sizeof(db->id),
649                                         attachdb_getdbpath_done, db);
650         if (!req2) {
651                 DEBUG(db->ctdb, LOG_ERR,
652                       "ctdb_attachdb_send(async): failed to allocate");
653                 db->callback(ctdb, req, db->private_data);
654                 return;
655         }
656         req->extra = req2;
657         req2->extra = req;
658         DEBUG(db->ctdb, LOG_DEBUG,
659               "ctdb_attachdb_send(async): created getdbpath request");
660 }
661
662 static void destroy_req_db(struct ctdb_connection *ctdb,
663                            struct ctdb_request *req)
664 {
665         /* Incomplete db is in priv_data. */
666         free(req->priv_data);
667         /* second request is chained off this one. */
668         if (req->extra) {
669                 ctdb_request_free(ctdb, req->extra);
670         }
671 }
672
673 struct ctdb_request *
674 ctdb_attachdb_send(struct ctdb_connection *ctdb,
675                    const char *name, bool persistent, uint32_t tdb_flags,
676                    ctdb_callback_t callback, void *private_data)
677 {
678         struct ctdb_request *req;
679         struct ctdb_db *db;
680         uint32_t opcode;
681
682         /* FIXME: Search if db already open. */
683         db = malloc(sizeof(*db));
684         if (!db) {
685                 return NULL;
686         }
687
688         if (persistent) {
689                 opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
690         } else {
691                 opcode = CTDB_CONTROL_DB_ATTACH;
692         }
693
694         req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
695                                        strlen(name) + 1, attachdb_done, db);
696         if (!req) {
697                 DEBUG(ctdb, LOG_ERR,
698                       "ctdb_attachdb_send: failed allocating DB_ATTACH");
699                 free(db);
700                 return NULL;
701         }
702
703         db->ctdb = ctdb;
704         db->tdb_flags = tdb_flags;
705         db->persistent = persistent;
706         db->callback = callback;
707         db->private_data = private_data;
708
709         req->extra_destructor = destroy_req_db;
710         /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
711         req->extra = NULL;
712
713         /* Flags get overloaded into srvid. */
714         req->hdr.control->srvid = tdb_flags;
715         DEBUG(db->ctdb, LOG_DEBUG,
716               "ctdb_attachdb_send: DB_ATTACH request %p", req);
717         return req;
718 }
719
720 static unsigned long lock_magic(struct ctdb_lock *lock)
721 {
722         /* A non-zero magic specific to this structure. */
723         return ((unsigned long)lock->key.dptr
724                 ^ (((unsigned long)lock->key.dptr) << 16)
725                 ^ 0xBADC0FFEEBADC0DEULL)
726                 | 1;
727 }
728
729 /* This is only called on locks before they're held. */
730 static void free_lock(struct ctdb_lock *lock)
731 {
732         if (lock->held_magic) {
733                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
734                       "free_lock invalid lock %p", lock);
735         }
736         free(lock->hdr);
737         free(lock);
738 }
739
740
741 void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock)
742 {
743         if (lock->held_magic != lock_magic(lock)) {
744                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
745                       "ctdb_release_lock invalid lock %p", lock);
746         } else if (lock->ctdb_db != ctdb_db) {
747                 errno = EBADF;
748                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
749                       "ctdb_release_lock: wrong ctdb_db.");
750         } else {
751                 tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
752                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
753                       "ctdb_release_lock %p", lock);
754                 remove_lock(lock->ctdb_db->ctdb, lock);
755         }
756         lock->held_magic = 0;
757         free_lock(lock);
758 }
759
760
761 /* We keep the lock if local node is the dmaster. */
762 static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
763 {
764         struct ctdb_ltdb_header *hdr;
765
766         if (tdb_chainlock(lock->ctdb_db->tdb, lock->key) != 0) {
767                 DEBUG(lock->ctdb_db->ctdb, LOG_WARNING,
768                       "ctdb_readrecordlock_async: failed to chainlock");
769                 return NULL;
770         }
771
772         hdr = ctdb_local_fetch(lock->ctdb_db->tdb, lock->key, data);
773         if (hdr && hdr->dmaster == lock->ctdb_db->ctdb->pnn) {
774                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
775                       "ctdb_readrecordlock_async: got local lock");
776                 lock->held_magic = lock_magic(lock);
777                 lock->hdr = hdr;
778                 add_lock(lock->ctdb_db->ctdb, lock);
779                 return true;
780         }
781
782         tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
783         free(hdr);
784         return NULL;
785 }
786
787 /* If they shutdown before we hand them the lock, we free it here. */
788 static void destroy_lock(struct ctdb_connection *ctdb,
789                          struct ctdb_request *req)
790 {
791         free_lock(req->extra);
792 }
793
794 static void readrecordlock_retry(struct ctdb_connection *ctdb,
795                                  struct ctdb_request *req, void *private)
796 {
797         struct ctdb_lock *lock = req->extra;
798         struct ctdb_reply_call *reply;
799         TDB_DATA data;
800
801         /* OK, we've received reply to noop migration */
802         reply = unpack_reply_call(ctdb, req, CTDB_NULL_FUNC);
803         if (!reply || reply->status != 0) {
804                 if (reply) {
805                         DEBUG(ctdb, LOG_ERR,
806                               "ctdb_readrecordlock_async(async):"
807                               " NULL_FUNC returned %i", reply->status);
808                 }
809                 lock->callback(lock->ctdb_db, NULL, tdb_null, private);
810                 ctdb_request_free(ctdb, req); /* Also frees lock. */
811                 return;
812         }
813
814         /* Can we get lock now? */
815         if (try_readrecordlock(lock, &data)) {
816                 /* Now it's their responsibility to free lock & request! */
817                 req->extra_destructor = NULL;
818                 lock->callback(lock->ctdb_db, lock, data, private);
819                 ctdb_request_free(ctdb, req);
820                 return;
821         }
822
823         /* Retransmit the same request again (we lost race). */
824         io_elem_reset(req->io);
825         DLIST_ADD(ctdb->outq, req);
826 }
827
828 bool
829 ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
830                           ctdb_rrl_callback_t callback, void *cbdata)
831 {
832         struct ctdb_request *req;
833         struct ctdb_lock *lock;
834         TDB_DATA data;
835
836         if (holding_lock(ctdb_db->ctdb)) {
837                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
838                       "ctdb_readrecordlock_async: already holding lock");
839                 return false;
840         }
841
842         /* Setup lock */
843         lock = malloc(sizeof(*lock) + key.dsize);
844         if (!lock) {
845                 DEBUG(ctdb_db->ctdb, LOG_ERR,
846                       "ctdb_readrecordlock_async: lock allocation failed");
847                 return false;
848         }
849         lock->key.dptr = (void *)(lock + 1);
850         memcpy(lock->key.dptr, key.dptr, key.dsize);
851         lock->key.dsize = key.dsize;
852         lock->ctdb_db = ctdb_db;
853         lock->hdr = NULL;
854         lock->held_magic = 0;
855
856         /* Fast path. */
857         if (try_readrecordlock(lock, &data)) {
858                 callback(ctdb_db, lock, data, cbdata);
859                 return true;
860         }
861
862         /* Slow path: create request. */
863         req = new_ctdb_request(offsetof(struct ctdb_req_call, data)
864                                + key.dsize, readrecordlock_retry, cbdata);
865         if (!req) {
866                 DEBUG(ctdb_db->ctdb, LOG_ERR,
867                       "ctdb_readrecordlock_async: allocation failed");
868                 free_lock(lock);
869                 return NULL;
870         }
871         req->extra = lock;
872         req->extra_destructor = destroy_lock;
873         /* We store the original callback in the lock, and use our own. */
874         lock->callback = callback;
875
876         io_elem_init_req_header(req->io, CTDB_REQ_CALL, CTDB_CURRENT_NODE,
877                                 new_reqid(ctdb_db->ctdb));
878
879         req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
880         req->hdr.call->db_id = ctdb_db->id;
881         req->hdr.call->callid = CTDB_NULL_FUNC;
882         req->hdr.call->hopcount = 0;
883         req->hdr.call->keylen = key.dsize;
884         req->hdr.call->calldatalen = 0;
885         memcpy(req->hdr.call->data, key.dptr, key.dsize);
886         DLIST_ADD(ctdb_db->ctdb->outq, req);
887         return true;
888 }
889
890 bool ctdb_writerecord(struct ctdb_db *ctdb_db,
891                       struct ctdb_lock *lock, TDB_DATA data)
892 {
893         if (lock->ctdb_db != ctdb_db) {
894                 errno = EBADF;
895                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
896                       "ctdb_writerecord: Can not write, wrong ctdb_db.");
897                 return false;
898         }
899
900         if (lock->held_magic != lock_magic(lock)) {
901                 errno = EBADF;
902                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
903                       "ctdb_writerecord: Can not write. Lock has been released.");
904                 return false;
905         }
906                 
907         if (ctdb_db->persistent) {
908                 errno = EINVAL;
909                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
910                       "ctdb_writerecord: cannot write to persistent db");
911                 return false;
912         }
913
914         switch (ctdb_local_store(ctdb_db->tdb, lock->key, lock->hdr, data)) {
915         case 0:
916                 DEBUG(ctdb_db->ctdb, LOG_DEBUG,
917                       "ctdb_writerecord: optimized away noop write.");
918                 /* fall thru */
919         case 1:
920                 return true;
921
922         default:
923                 switch (errno) {
924                 case ENOMEM:
925                         DEBUG(ctdb_db->ctdb, LOG_CRIT,
926                               "ctdb_writerecord: out of memory.");
927                         break;
928                 case EINVAL:
929                         DEBUG(ctdb_db->ctdb, LOG_ALERT,
930                               "ctdb_writerecord: record changed under lock?");
931                         break;
932                 default: /* TDB already logged. */
933                         break;
934                 }
935                 return false;
936         }
937 }