e06c66ca85497d3b1904ea89d77a31745721b1bd
[sahlberg/ctdb.git] / libctdb / ctdb.c
1 /*
2    core of libctdb
3
4    Copyright (C) Rusty Russell 2010
5
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, see <http://www.gnu.org/licenses/>.
18 */
19 #include <ctdb.h>
20 #include <poll.h>
21 #include <errno.h>
22 #include <unistd.h>
23 #include <fcntl.h>
24 #include <stdlib.h>
25 #include <sys/socket.h>
26 #include <sys/un.h>
27 #include <sys/ioctl.h>
28 #include "libctdb_private.h"
29 #include "io_elem.h"
30 #include "local_tdb.h"
31 #include "messages.h"
32 #include <dlinklist.h>
33 #include <ctdb_protocol.h>
34
35 /* Remove type-safety macros. */
36 #undef ctdb_attachdb_send
37 #undef ctdb_readrecordlock_async
38 #undef ctdb_connect
39
40 struct ctdb_lock {
41         struct ctdb_lock *next, *prev;
42
43         struct ctdb_db *ctdb_db;
44         TDB_DATA key;
45
46         /* This will always be set by the time user sees this. */
47         unsigned long held_magic;
48         struct ctdb_ltdb_header *hdr;
49
50         /* For convenience, we stash original callback here. */
51         ctdb_rrl_callback_t callback;
52 };
53
54 struct ctdb_db {
55         struct ctdb_connection *ctdb;
56         bool persistent;
57         uint32_t tdb_flags;
58         uint32_t id;
59         struct tdb_context *tdb;
60
61         ctdb_callback_t callback;
62         void *private_data;
63 };
64
65 static void remove_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
66 {
67         DLIST_REMOVE(ctdb->locks, lock);
68 }
69
70 /* FIXME: for thread safety, need tid info too. */
71 static bool holding_lock(struct ctdb_connection *ctdb)
72 {
73         /* For the moment, you can't ever hold more than 1 lock. */
74         return (ctdb->locks != NULL);
75 }
76
77 static void add_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
78 {
79         DLIST_ADD(ctdb->locks, lock);
80 }
81
82 static void cleanup_locks(struct ctdb_connection *ctdb, struct ctdb_db *db)
83 {
84         struct ctdb_lock *i, *next;
85
86         for (i = ctdb->locks; i; i = next) {
87                 /* Grab next pointer, as release_lock will free i */
88                 next = i->next;
89                 if (i->ctdb_db == db) {
90                         ctdb_release_lock(db, i);
91                 }
92         }
93 }
94
/* FIXME: Could be in shared util code with rest of ctdb */
/* Close @fd while leaving errno exactly as the caller had it. */
static void close_noerr(int fd)
{
        const int saved_errno = errno;

        close(fd);
        errno = saved_errno;
}
102
/* FIXME: Could be in shared util code with rest of ctdb */
/* free(@p) while leaving errno exactly as the caller had it. */
static void free_noerr(void *p)
{
        const int saved_errno = errno;

        free(p);
        errno = saved_errno;
}
110
/* FIXME: Could be in shared util code with rest of ctdb */
/*
 * Add O_NONBLOCK to the file status flags of @fd.
 *
 * Best effort: nothing is reported to the caller.  If the current flags
 * cannot be read, the descriptor is left untouched instead of OR-ing the
 * -1 error value into the new flag word.
 */
static void set_nonblocking(int fd)
{
        int flags = fcntl(fd, F_GETFL, 0);

        if (flags == -1)
                return;
        (void)fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
118
/* FIXME: Could be in shared util code with rest of ctdb */
/*
 * Add FD_CLOEXEC to the descriptor flags of @fd.
 *
 * Best effort: nothing is reported to the caller.  If the current flags
 * cannot be read, the descriptor is left untouched instead of OR-ing the
 * -1 error value into the new flag word.
 */
static void set_close_on_exec(int fd)
{
        int flags = fcntl(fd, F_GETFD, 0);

        if (flags == -1)
                return;
        (void)fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
}
126
127 static void set_pnn(struct ctdb_connection *ctdb,
128                     struct ctdb_request *req,
129                     void *unused)
130 {
131         if (!ctdb_getpnn_recv(ctdb, req, &ctdb->pnn)) {
132                 DEBUG(ctdb, LOG_CRIT,
133                       "ctdb_connect(async): failed to get pnn");
134                 ctdb->broken = true;
135         }
136         ctdb_request_free(ctdb, req);
137 }
138
139 struct ctdb_connection *ctdb_connect(const char *addr,
140                                      ctdb_log_fn_t log_fn, void *log_priv)
141 {
142         struct ctdb_connection *ctdb;
143         struct sockaddr_un sun;
144
145         ctdb = malloc(sizeof(*ctdb));
146         if (!ctdb) {
147                 /* With no format string, we hope it doesn't use ap! */
148                 va_list ap;
149                 memset(&ap, 0, sizeof(ap));
150                 errno = ENOMEM;
151                 log_fn(log_priv, LOG_ERR, "ctdb_connect: no memory", ap);
152                 goto fail;
153         }
154         ctdb->outq = NULL;
155         ctdb->doneq = NULL;
156         ctdb->in = NULL;
157         ctdb->inqueue = NULL;
158         ctdb->message_handlers = NULL;
159         ctdb->next_id = 0;
160         ctdb->broken = false;
161         ctdb->log = log_fn;
162         ctdb->log_priv = log_priv;
163         ctdb->locks = NULL;
164
165         memset(&sun, 0, sizeof(sun));
166         sun.sun_family = AF_UNIX;
167         if (!addr)
168                 addr = CTDB_PATH;
169         strncpy(sun.sun_path, addr, sizeof(sun.sun_path));
170         ctdb->fd = socket(AF_UNIX, SOCK_STREAM, 0);
171         if (ctdb->fd < 0)
172                 goto free_fail;
173
174         set_nonblocking(ctdb->fd);
175         set_close_on_exec(ctdb->fd);
176
177         if (connect(ctdb->fd, (struct sockaddr *)&sun, sizeof(sun)) == -1)
178                 goto close_fail;
179
180         /* Immediately queue a request to get our pnn. */
181         if (!ctdb_getpnn_send(ctdb, CTDB_CURRENT_NODE, set_pnn, NULL))
182                 goto close_fail;
183
184         return ctdb;
185
186 close_fail:
187         close_noerr(ctdb->fd);
188 free_fail:
189         free_noerr(ctdb);
190 fail:
191         return NULL;
192 }
193
194 void ctdb_disconnect(struct ctdb_connection *ctdb)
195 {
196         struct ctdb_request *i;
197
198         DEBUG(ctdb, LOG_DEBUG, "ctdb_disconnect");
199
200         while ((i = ctdb->outq) != NULL) {
201                 DLIST_REMOVE(ctdb->outq, i);
202                 ctdb_request_free(ctdb, i);
203         }
204
205         while ((i = ctdb->doneq) != NULL) {
206                 DLIST_REMOVE(ctdb->doneq, i);
207                 ctdb_request_free(ctdb, i);
208         }
209
210         if (ctdb->in)
211                 free_io_elem(ctdb->in);
212
213         remove_message_handlers(ctdb);
214
215         close(ctdb->fd);
216         /* Just in case they try to reuse */
217         ctdb->fd = -1;
218         free(ctdb);
219 }
220
221 int ctdb_get_fd(struct ctdb_connection *ctdb)
222 {
223         return ctdb->fd;
224 }
225
226 int ctdb_which_events(struct ctdb_connection *ctdb)
227 {
228         int events = POLLIN;
229
230         if (ctdb->outq)
231                 events |= POLLOUT;
232         return events;
233 }
234
235 struct ctdb_request *new_ctdb_request(size_t len,
236                                       ctdb_callback_t cb, void *cbdata)
237 {
238         struct ctdb_request *req = malloc(sizeof(*req));
239         if (!req)
240                 return NULL;
241         req->io = new_io_elem(len);
242         if (!req->io) {
243                 free(req);
244                 return NULL;
245         }
246         req->hdr.hdr = io_elem_data(req->io, NULL);
247         req->reply = NULL;
248         req->callback = cb;
249         req->priv_data = cbdata;
250         req->extra = NULL;
251         req->extra_destructor = NULL;
252         return req;
253 }
254
255 void ctdb_request_free(struct ctdb_connection *ctdb, struct ctdb_request *req)
256 {
257         if (req->next || req->prev) {
258                 DEBUG(ctdb, LOG_ALERT,
259                       "ctdb_request_free: request not complete! ctdb_cancel? %p (id %u)",
260                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
261                 ctdb_cancel(ctdb, req);
262                 return;
263         }
264         if (req->extra_destructor) {
265                 req->extra_destructor(ctdb, req);
266         }
267         if (req->reply) {
268                 free_io_elem(req->reply);
269         }
270         free_io_elem(req->io);
271         free(req);
272 }
273
274 /* Sanity-checking wrapper for reply. */
275 static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
276                                                  struct ctdb_request *req,
277                                                  uint32_t callid)
278 {
279         size_t len;
280         struct ctdb_reply_call *inhdr = io_elem_data(req->reply, &len);
281
282         /* Library user error if this isn't a reply to a call. */
283         if (req->hdr.hdr->operation != CTDB_REQ_CALL) {
284                 errno = EINVAL;
285                 DEBUG(ctdb, LOG_ALERT,
286                       "This was not a ctdbd call request: operation %u",
287                       req->hdr.hdr->operation);
288                 return NULL;
289         }
290
291         if (req->hdr.call->callid != callid) {
292                 errno = EINVAL;
293                 DEBUG(ctdb, LOG_ALERT,
294                       "This was not a ctdbd %u call request: %u",
295                       callid, req->hdr.call->callid);
296                 return NULL;
297         }
298
299         /* ctdbd or our error if this isn't a reply call. */
300         if (len < sizeof(*inhdr) || inhdr->hdr.operation != CTDB_REPLY_CALL) {
301                 errno = EIO;
302                 DEBUG(ctdb, LOG_CRIT,
303                       "Invalid ctdbd call reply: len %zu, operation %u",
304                       len, inhdr->hdr.operation);
305                 return NULL;
306         }
307
308         return inhdr;
309 }
310
311 /* Sanity-checking wrapper for reply. */
312 struct ctdb_reply_control *unpack_reply_control(struct ctdb_connection *ctdb,
313                                                 struct ctdb_request *req,
314                                                 enum ctdb_controls control)
315 {
316         size_t len;
317         struct ctdb_reply_control *inhdr = io_elem_data(req->reply, &len);
318
319         /* Library user error if this isn't a reply to a call. */
320         if (len < sizeof(*inhdr)) {
321                 errno = EINVAL;
322                 DEBUG(ctdb, LOG_ALERT,
323                       "Short ctdbd control reply: %zu bytes", len);
324                 return NULL;
325         }
326         if (req->hdr.hdr->operation != CTDB_REQ_CONTROL) {
327                 errno = EINVAL;
328                 DEBUG(ctdb, LOG_ALERT,
329                       "This was not a ctdbd control request: operation %u",
330                       req->hdr.hdr->operation);
331                 return NULL;
332         }
333
334         /* ... or if it was a different control from what we expected. */
335         if (req->hdr.control->opcode != control) {
336                 errno = EINVAL;
337                 DEBUG(ctdb, LOG_ALERT,
338                       "This was not an opcode %u ctdbd control request: %u",
339                       control, req->hdr.control->opcode);
340                 return NULL;
341         }
342
343         /* ctdbd or our error if this isn't a reply call. */
344         if (inhdr->hdr.operation != CTDB_REPLY_CONTROL) {
345                 errno = EIO;
346                 DEBUG(ctdb, LOG_CRIT,
347                       "Invalid ctdbd control reply: operation %u",
348                       inhdr->hdr.operation);
349                 return NULL;
350         }
351
352         return inhdr;
353 }
354
355 static void handle_incoming(struct ctdb_connection *ctdb, struct io_elem *in)
356 {
357         struct ctdb_req_header *hdr;
358         size_t len;
359         struct ctdb_request *i;
360
361         hdr = io_elem_data(in, &len);
362         /* FIXME: use len to check packet! */
363
364         if (hdr->operation == CTDB_REQ_MESSAGE) {
365                 deliver_message(ctdb, hdr);
366                 return;
367         }
368
369         for (i = ctdb->doneq; i; i = i->next) {
370                 if (i->hdr.hdr->reqid == hdr->reqid) {
371                         DLIST_REMOVE(ctdb->doneq, i);
372                         i->reply = in;
373                         i->callback(ctdb, i, i->priv_data);
374                         return;
375                 }
376         }
377         DEBUG(ctdb, LOG_WARNING,
378               "Unexpected ctdbd request reply: operation %u reqid %u",
379               hdr->operation, hdr->reqid);
380         free_io_elem(in);
381 }
382
/*
 * Remove "harmless" errors: a negative @ret whose errno is EINTR or
 * EWOULDBLOCK is reported as 0; every other value is passed through.
 */
static ssize_t real_error(ssize_t ret)
{
        int harmless;

        if (ret >= 0)
                return ret;

        harmless = (errno == EINTR) || (errno == EWOULDBLOCK);
        return harmless ? 0 : ret;
}
390
391 bool ctdb_service(struct ctdb_connection *ctdb, int revents)
392 {
393         if (ctdb->broken) {
394                 return false;
395         }
396
397         if (holding_lock(ctdb)) {
398                 DEBUG(ctdb, LOG_ALERT, "Do not block while holding lock!");
399         }
400
401         if (revents & POLLOUT) {
402                 while (ctdb->outq) {
403                         if (real_error(write_io_elem(ctdb->fd,
404                                                      ctdb->outq->io)) < 0) {
405                                 DEBUG(ctdb, LOG_ERR,
406                                       "ctdb_service: error writing to ctdbd");
407                                 ctdb->broken = true;
408                                 return false;
409                         }
410                         if (io_elem_finished(ctdb->outq->io)) {
411                                 struct ctdb_request *done = ctdb->outq;
412                                 DLIST_REMOVE(ctdb->outq, done);
413                                 /* We add at the head: any dead ones
414                                  * sit and end. */
415                                 DLIST_ADD(ctdb->doneq, done);
416                         }
417                 }
418         }
419
420         while (revents & POLLIN) {
421                 int ret;
422                 int num_ready = 0;
423
424                 if (ioctl(ctdb->fd, FIONREAD, &num_ready) != 0) {
425                         DEBUG(ctdb, LOG_ERR,
426                               "ctdb_service: ioctl(FIONREAD) %d", errno);
427                         ctdb->broken = true;
428                         return false;
429                 }
430                 if (num_ready == 0) {
431                         /* the descriptor has been closed or we have all our data */
432                         break;
433                 }
434
435
436                 if (!ctdb->in) {
437                         ctdb->in = new_io_elem(sizeof(struct ctdb_req_header));
438                         if (!ctdb->in) {
439                                 DEBUG(ctdb, LOG_ERR,
440                                       "ctdb_service: allocating readbuf");
441                                 ctdb->broken = true;
442                                 return false;
443                         }
444                 }
445
446                 ret = read_io_elem(ctdb->fd, ctdb->in);
447                 if (real_error(ret) < 0 || ret == 0) {
448                         /* They closed fd? */
449                         if (ret == 0)
450                                 errno = EBADF;
451                         DEBUG(ctdb, LOG_ERR,
452                               "ctdb_service: error reading from ctdbd");
453                         ctdb->broken = true;
454                         return false;
455                 } else if (ret < 0) {
456                         /* No progress, stop loop. */
457                         break;
458                 } else if (io_elem_finished(ctdb->in)) {
459                         io_elem_queue(ctdb, ctdb->in);
460                         ctdb->in = NULL;
461                 }
462         }
463
464
465         while (ctdb->inqueue != NULL) {
466                 struct io_elem *io = ctdb->inqueue;
467
468                 io_elem_dequeue(ctdb, io);
469                 handle_incoming(ctdb, io);
470         }
471
472         return true;
473 }
474
475 /* This is inefficient.  We could pull in idtree.c. */
476 static bool reqid_used(const struct ctdb_connection *ctdb, uint32_t reqid)
477 {
478         struct ctdb_request *i;
479
480         for (i = ctdb->outq; i; i = i->next) {
481                 if (i->hdr.hdr->reqid == reqid) {
482                         return true;
483                 }
484         }
485         for (i = ctdb->doneq; i; i = i->next) {
486                 if (i->hdr.hdr->reqid == reqid) {
487                         return true;
488                 }
489         }
490         return false;
491 }
492
493 uint32_t new_reqid(struct ctdb_connection *ctdb)
494 {
495         while (reqid_used(ctdb, ctdb->next_id)) {
496                 ctdb->next_id++;
497         }
498         return ctdb->next_id++;
499 }
500
501 struct ctdb_request *new_ctdb_control_request(struct ctdb_connection *ctdb,
502                                               uint32_t opcode,
503                                               uint32_t destnode,
504                                               const void *extra_data,
505                                               size_t extra,
506                                               ctdb_callback_t callback,
507                                               void *cbdata)
508 {
509         struct ctdb_request *req;
510         struct ctdb_req_control *pkt;
511
512         req = new_ctdb_request(offsetof(struct ctdb_req_control, data) + extra, callback, cbdata);
513         if (!req)
514                 return NULL;
515
516         io_elem_init_req_header(req->io,
517                                 CTDB_REQ_CONTROL, destnode, new_reqid(ctdb));
518
519         pkt = req->hdr.control;
520         pkt->pad = 0;
521         pkt->opcode = opcode;
522         pkt->srvid = 0;
523         pkt->client_id = 0;
524         pkt->flags = 0;
525         pkt->datalen = extra;
526         memcpy(pkt->data, extra_data, extra);
527         DLIST_ADD(ctdb->outq, req);
528         return req;
529 }
530
/* Callback installed by ctdb_cancel(): just frees the request. */
void ctdb_cancel_callback(struct ctdb_connection *ctdb,
                          struct ctdb_request *req,
                          void *unused)
{
        (void)unused;
        ctdb_request_free(ctdb, req);
}
537
538 void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
539 {
540         if (!req->next && !req->prev) {
541                 DEBUG(ctdb, LOG_ALERT,
542                       "ctdb_cancel: request completed! ctdb_request_free? %p (id %u)",
543                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
544                 ctdb_request_free(ctdb, req);
545                 return;
546         }
547
548         DEBUG(ctdb, LOG_DEBUG, "ctdb_cancel: %p (id %u)",
549               req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
550
551         /* FIXME: If it's not sent, we could just free it right now. */
552         req->callback = ctdb_cancel_callback;
553 }
554
555 void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db)
556 {
557         cleanup_locks(ctdb, db);
558         tdb_close(db->tdb);
559         free(db);
560 }
561
562 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
563                                     struct ctdb_request *req,
564                                     void *_db)
565 {
566         struct ctdb_db *db = _db;
567
568         /* Do callback on original request. */
569         db->callback(ctdb, req->extra, db->private_data);
570 }
571
572 struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
573                                    struct ctdb_request *req)
574 {
575         struct ctdb_request *dbpath_req = req->extra;
576         struct ctdb_reply_control *reply;
577         struct ctdb_db *db = req->priv_data;
578         uint32_t tdb_flags = db->tdb_flags;
579         struct tdb_logging_context log;
580
581         /* Never sent the dbpath request?  We've failed. */
582         if (!dbpath_req) {
583                 /* FIXME: Save errno? */
584                 errno = EINVAL;
585                 return NULL;
586         }
587
588         reply = unpack_reply_control(ctdb, dbpath_req, CTDB_CONTROL_GETDBPATH);
589         if (!reply) {
590                 return NULL;
591         }
592         if (reply->status != 0) {
593                 DEBUG(db->ctdb, LOG_ERR,
594                       "ctdb_attachdb_recv: reply status %i", reply->status);
595                 return NULL;
596         }
597
598         tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC;
599         tdb_flags |= TDB_DISALLOW_NESTING;
600
601         log.log_fn = ctdb_tdb_log_bridge;
602         log.log_private = ctdb;
603         db->tdb = tdb_open_ex((char *)reply->data, 0, tdb_flags, O_RDWR, 0,
604                               &log, NULL);
605         if (db->tdb == NULL) {
606                 DEBUG(db->ctdb, LOG_ERR,
607                       "ctdb_attachdb_recv: failed to tdb_open %s",
608                       (char *)reply->data);
609                 return NULL;
610         }
611
612         /* Finally, separate the db from the request (see destroy_req_db). */
613         req->priv_data = NULL;
614         DEBUG(db->ctdb, LOG_DEBUG,
615               "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
616         return db;
617 }
618
619 static void attachdb_done(struct ctdb_connection *ctdb,
620                           struct ctdb_request *req,
621                           void *_db)
622 {
623         struct ctdb_db *db = _db;
624         struct ctdb_request *req2;
625         struct ctdb_reply_control *reply;
626         enum ctdb_controls control = CTDB_CONTROL_DB_ATTACH;
627
628         if (db->persistent) {
629                 control = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
630         }
631
632         reply = unpack_reply_control(ctdb, req, control);
633         if (!reply || reply->status != 0) {
634                 if (reply) {
635                         DEBUG(ctdb, LOG_ERR,
636                               "ctdb_attachdb_send(async): DB_ATTACH status %i",
637                               reply->status);
638                 }
639                 /* We failed.  Hand request to user and have them discover it
640                  * via ctdb_attachdb_recv. */
641                 db->callback(ctdb, req, db->private_data);
642                 return;
643         }
644         db->id = *(uint32_t *)reply->data;
645
646         /* Now we do another call, to get the dbpath. */
647         req2 = new_ctdb_control_request(db->ctdb, CTDB_CONTROL_GETDBPATH,
648                                         CTDB_CURRENT_NODE,
649                                         &db->id, sizeof(db->id),
650                                         attachdb_getdbpath_done, db);
651         if (!req2) {
652                 DEBUG(db->ctdb, LOG_ERR,
653                       "ctdb_attachdb_send(async): failed to allocate");
654                 db->callback(ctdb, req, db->private_data);
655                 return;
656         }
657         req->extra = req2;
658         req2->extra = req;
659         DEBUG(db->ctdb, LOG_DEBUG,
660               "ctdb_attachdb_send(async): created getdbpath request");
661 }
662
663 static void destroy_req_db(struct ctdb_connection *ctdb,
664                            struct ctdb_request *req)
665 {
666         /* Incomplete db is in priv_data. */
667         free(req->priv_data);
668         /* second request is chained off this one. */
669         if (req->extra) {
670                 ctdb_request_free(ctdb, req->extra);
671         }
672 }
673
674 struct ctdb_request *
675 ctdb_attachdb_send(struct ctdb_connection *ctdb,
676                    const char *name, bool persistent, uint32_t tdb_flags,
677                    ctdb_callback_t callback, void *private_data)
678 {
679         struct ctdb_request *req;
680         struct ctdb_db *db;
681         uint32_t opcode;
682
683         /* FIXME: Search if db already open. */
684         db = malloc(sizeof(*db));
685         if (!db) {
686                 return NULL;
687         }
688
689         if (persistent) {
690                 opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
691         } else {
692                 opcode = CTDB_CONTROL_DB_ATTACH;
693         }
694
695         req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
696                                        strlen(name) + 1, attachdb_done, db);
697         if (!req) {
698                 DEBUG(ctdb, LOG_ERR,
699                       "ctdb_attachdb_send: failed allocating DB_ATTACH");
700                 free(db);
701                 return NULL;
702         }
703
704         db->ctdb = ctdb;
705         db->tdb_flags = tdb_flags;
706         db->persistent = persistent;
707         db->callback = callback;
708         db->private_data = private_data;
709
710         req->extra_destructor = destroy_req_db;
711         /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
712         req->extra = NULL;
713
714         /* Flags get overloaded into srvid. */
715         req->hdr.control->srvid = tdb_flags;
716         DEBUG(db->ctdb, LOG_DEBUG,
717               "ctdb_attachdb_send: DB_ATTACH request %p", req);
718         return req;
719 }
720
721 static unsigned long lock_magic(struct ctdb_lock *lock)
722 {
723         /* A non-zero magic specific to this structure. */
724         return ((unsigned long)lock->key.dptr
725                 ^ (((unsigned long)lock->key.dptr) << 16)
726                 ^ 0xBADC0FFEEBADC0DEULL)
727                 | 1;
728 }
729
730 /* This is only called on locks before they're held. */
731 static void free_lock(struct ctdb_lock *lock)
732 {
733         if (lock->held_magic) {
734                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
735                       "free_lock invalid lock %p", lock);
736         }
737         free(lock->hdr);
738         free(lock);
739 }
740
741
742 void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock)
743 {
744         if (lock->held_magic != lock_magic(lock)) {
745                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
746                       "ctdb_release_lock invalid lock %p", lock);
747         } else if (lock->ctdb_db != ctdb_db) {
748                 errno = EBADF;
749                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
750                       "ctdb_release_lock: wrong ctdb_db.");
751         } else {
752                 tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
753                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
754                       "ctdb_release_lock %p", lock);
755                 remove_lock(lock->ctdb_db->ctdb, lock);
756         }
757         lock->held_magic = 0;
758         free_lock(lock);
759 }
760
761
762 /* We keep the lock if local node is the dmaster. */
763 static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
764 {
765         struct ctdb_ltdb_header *hdr;
766
767         if (tdb_chainlock(lock->ctdb_db->tdb, lock->key) != 0) {
768                 DEBUG(lock->ctdb_db->ctdb, LOG_WARNING,
769                       "ctdb_readrecordlock_async: failed to chainlock");
770                 return NULL;
771         }
772
773         hdr = ctdb_local_fetch(lock->ctdb_db->tdb, lock->key, data);
774         if (hdr && hdr->dmaster == lock->ctdb_db->ctdb->pnn) {
775                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
776                       "ctdb_readrecordlock_async: got local lock");
777                 lock->held_magic = lock_magic(lock);
778                 lock->hdr = hdr;
779                 add_lock(lock->ctdb_db->ctdb, lock);
780                 return true;
781         }
782
783         tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
784         free(hdr);
785         return NULL;
786 }
787
788 /* If they shutdown before we hand them the lock, we free it here. */
789 static void destroy_lock(struct ctdb_connection *ctdb,
790                          struct ctdb_request *req)
791 {
792         free_lock(req->extra);
793 }
794
795 static void readrecordlock_retry(struct ctdb_connection *ctdb,
796                                  struct ctdb_request *req, void *private)
797 {
798         struct ctdb_lock *lock = req->extra;
799         struct ctdb_reply_call *reply;
800         TDB_DATA data;
801
802         /* OK, we've received reply to noop migration */
803         reply = unpack_reply_call(ctdb, req, CTDB_NULL_FUNC);
804         if (!reply || reply->status != 0) {
805                 if (reply) {
806                         DEBUG(ctdb, LOG_ERR,
807                               "ctdb_readrecordlock_async(async):"
808                               " NULL_FUNC returned %i", reply->status);
809                 }
810                 lock->callback(lock->ctdb_db, NULL, tdb_null, private);
811                 ctdb_request_free(ctdb, req); /* Also frees lock. */
812                 return;
813         }
814
815         /* Can we get lock now? */
816         if (try_readrecordlock(lock, &data)) {
817                 /* Now it's their responsibility to free lock & request! */
818                 req->extra_destructor = NULL;
819                 lock->callback(lock->ctdb_db, lock, data, private);
820                 ctdb_request_free(ctdb, req);
821                 return;
822         }
823
824         /* Retransmit the same request again (we lost race). */
825         io_elem_reset(req->io);
826         DLIST_ADD(ctdb->outq, req);
827 }
828
829 bool
830 ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
831                           ctdb_rrl_callback_t callback, void *cbdata)
832 {
833         struct ctdb_request *req;
834         struct ctdb_lock *lock;
835         TDB_DATA data;
836
837         if (holding_lock(ctdb_db->ctdb)) {
838                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
839                       "ctdb_readrecordlock_async: already holding lock");
840                 return false;
841         }
842
843         /* Setup lock */
844         lock = malloc(sizeof(*lock) + key.dsize);
845         if (!lock) {
846                 DEBUG(ctdb_db->ctdb, LOG_ERR,
847                       "ctdb_readrecordlock_async: lock allocation failed");
848                 return false;
849         }
850         lock->key.dptr = (void *)(lock + 1);
851         memcpy(lock->key.dptr, key.dptr, key.dsize);
852         lock->key.dsize = key.dsize;
853         lock->ctdb_db = ctdb_db;
854         lock->hdr = NULL;
855         lock->held_magic = 0;
856
857         /* Fast path. */
858         if (try_readrecordlock(lock, &data)) {
859                 callback(ctdb_db, lock, data, cbdata);
860                 return true;
861         }
862
863         /* Slow path: create request. */
864         req = new_ctdb_request(offsetof(struct ctdb_req_call, data)
865                                + key.dsize, readrecordlock_retry, cbdata);
866         if (!req) {
867                 DEBUG(ctdb_db->ctdb, LOG_ERR,
868                       "ctdb_readrecordlock_async: allocation failed");
869                 free_lock(lock);
870                 return NULL;
871         }
872         req->extra = lock;
873         req->extra_destructor = destroy_lock;
874         /* We store the original callback in the lock, and use our own. */
875         lock->callback = callback;
876
877         io_elem_init_req_header(req->io, CTDB_REQ_CALL, CTDB_CURRENT_NODE,
878                                 new_reqid(ctdb_db->ctdb));
879
880         req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
881         req->hdr.call->db_id = ctdb_db->id;
882         req->hdr.call->callid = CTDB_NULL_FUNC;
883         req->hdr.call->hopcount = 0;
884         req->hdr.call->keylen = key.dsize;
885         req->hdr.call->calldatalen = 0;
886         memcpy(req->hdr.call->data, key.dptr, key.dsize);
887         DLIST_ADD(ctdb_db->ctdb->outq, req);
888         return true;
889 }
890
891 bool ctdb_writerecord(struct ctdb_db *ctdb_db,
892                       struct ctdb_lock *lock, TDB_DATA data)
893 {
894         if (lock->ctdb_db != ctdb_db) {
895                 errno = EBADF;
896                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
897                       "ctdb_writerecord: Can not write, wrong ctdb_db.");
898                 return false;
899         }
900
901         if (lock->held_magic != lock_magic(lock)) {
902                 errno = EBADF;
903                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
904                       "ctdb_writerecord: Can not write. Lock has been released.");
905                 return false;
906         }
907                 
908         if (ctdb_db->persistent) {
909                 errno = EINVAL;
910                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
911                       "ctdb_writerecord: cannot write to persistent db");
912                 return false;
913         }
914
915         switch (ctdb_local_store(ctdb_db->tdb, lock->key, lock->hdr, data)) {
916         case 0:
917                 DEBUG(ctdb_db->ctdb, LOG_DEBUG,
918                       "ctdb_writerecord: optimized away noop write.");
919                 /* fall thru */
920         case 1:
921                 return true;
922
923         default:
924                 switch (errno) {
925                 case ENOMEM:
926                         DEBUG(ctdb_db->ctdb, LOG_CRIT,
927                               "ctdb_writerecord: out of memory.");
928                         break;
929                 case EINVAL:
930                         DEBUG(ctdb_db->ctdb, LOG_ALERT,
931                               "ctdb_writerecord: record changed under lock?");
932                         break;
933                 default: /* TDB already logged. */
934                         break;
935                 }
936                 return false;
937         }
938 }