libctdb: don't allow ctdb_writerecord() for read-only records
[ctdb.git] / libctdb / ctdb.c
1 /*
2    core of libctdb
3
4    Copyright (C) Rusty Russell 2010
5    Copyright (C) Ronnie Sahlberg 2011
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include <ctdb.h>
21 #include <poll.h>
22 #include <errno.h>
23 #include <unistd.h>
24 #include <fcntl.h>
25 #include <stdlib.h>
26 #include <sys/socket.h>
27 #include <sys/un.h>
28 #include <sys/ioctl.h>
29 #include "libctdb_private.h"
30 #include "io_elem.h"
31 #include "local_tdb.h"
32 #include "messages.h"
33 #include <dlinklist.h>
34 #include <ctdb_protocol.h>
35
/* Remove the type-safety wrapper macros so the real functions can be defined below. */
37 #undef ctdb_attachdb_send
38 #undef ctdb_readrecordlock_async
39 #undef ctdb_readonlyrecordlock_async
40 #undef ctdb_connect
41
42 struct ctdb_lock {
43         struct ctdb_lock *next, *prev;
44
45         struct ctdb_db *ctdb_db;
46         TDB_DATA key;
47
48         /* Is this a request for read-only lock ? */
49         bool readonly;
50
51         /* This will always be set by the time user sees this. */
52         unsigned long held_magic;
53         struct ctdb_ltdb_header *hdr;
54
55         /* For convenience, we stash original callback here. */
56         ctdb_rrl_callback_t callback;
57 };
58
59 struct ctdb_db {
60         struct ctdb_connection *ctdb;
61         bool persistent;
62         uint32_t tdb_flags;
63         uint32_t id;
64         struct tdb_context *tdb;
65
66         ctdb_callback_t callback;
67         void *private_data;
68 };
69
70 static void remove_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
71 {
72         DLIST_REMOVE(ctdb->locks, lock);
73 }
74
75 /* FIXME: for thread safety, need tid info too. */
76 static bool holding_lock(struct ctdb_connection *ctdb)
77 {
78         /* For the moment, you can't ever hold more than 1 lock. */
79         return (ctdb->locks != NULL);
80 }
81
82 static void add_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
83 {
84         DLIST_ADD(ctdb->locks, lock);
85 }
86
87 static void cleanup_locks(struct ctdb_connection *ctdb, struct ctdb_db *db)
88 {
89         struct ctdb_lock *i, *next;
90
91         for (i = ctdb->locks; i; i = next) {
92                 /* Grab next pointer, as release_lock will free i */
93                 next = i->next;
94                 if (i->ctdb_db == db) {
95                         ctdb_release_lock(db, i);
96                 }
97         }
98 }
99
/* Close @fd while leaving errno exactly as the caller had it.
 * FIXME: Could be in shared util code with rest of ctdb */
static void close_noerr(int fd)
{
        const int saved_errno = errno;

        close(fd);
        errno = saved_errno;
}
107
/* Free @p while leaving errno exactly as the caller had it.
 * FIXME: Could be in shared util code with rest of ctdb */
static void free_noerr(void *p)
{
        const int saved_errno = errno;

        free(p);
        errno = saved_errno;
}
115
/* Add O_NONBLOCK to the file status flags of @fd.
 * FIXME: Could be in shared util code with rest of ctdb */
static void set_nonblocking(int fd)
{
        unsigned status_flags = fcntl(fd, F_GETFL, 0);

        fcntl(fd, F_SETFL, status_flags | O_NONBLOCK);
}
123
/* Add FD_CLOEXEC to the descriptor flags of @fd.
 * FIXME: Could be in shared util code with rest of ctdb */
static void set_close_on_exec(int fd)
{
        unsigned fd_flags = fcntl(fd, F_GETFD, 0);

        fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC);
}
131
132 static void set_pnn(struct ctdb_connection *ctdb,
133                     struct ctdb_request *req,
134                     void *unused)
135 {
136         if (!ctdb_getpnn_recv(ctdb, req, &ctdb->pnn)) {
137                 DEBUG(ctdb, LOG_CRIT,
138                       "ctdb_connect(async): failed to get pnn");
139                 ctdb->broken = true;
140         }
141         ctdb_request_free(ctdb, req);
142 }
143
144 struct ctdb_connection *ctdb_connect(const char *addr,
145                                      ctdb_log_fn_t log_fn, void *log_priv)
146 {
147         struct ctdb_connection *ctdb;
148         struct sockaddr_un sun;
149
150         ctdb = malloc(sizeof(*ctdb));
151         if (!ctdb) {
152                 /* With no format string, we hope it doesn't use ap! */
153                 va_list ap;
154                 memset(&ap, 0, sizeof(ap));
155                 errno = ENOMEM;
156                 log_fn(log_priv, LOG_ERR, "ctdb_connect: no memory", ap);
157                 goto fail;
158         }
159         ctdb->outq = NULL;
160         ctdb->doneq = NULL;
161         ctdb->in = NULL;
162         ctdb->inqueue = NULL;
163         ctdb->message_handlers = NULL;
164         ctdb->next_id = 0;
165         ctdb->broken = false;
166         ctdb->log = log_fn;
167         ctdb->log_priv = log_priv;
168         ctdb->locks = NULL;
169
170         memset(&sun, 0, sizeof(sun));
171         sun.sun_family = AF_UNIX;
172         if (!addr)
173                 addr = CTDB_PATH;
174         strncpy(sun.sun_path, addr, sizeof(sun.sun_path));
175         ctdb->fd = socket(AF_UNIX, SOCK_STREAM, 0);
176         if (ctdb->fd < 0)
177                 goto free_fail;
178
179         set_nonblocking(ctdb->fd);
180         set_close_on_exec(ctdb->fd);
181
182         if (connect(ctdb->fd, (struct sockaddr *)&sun, sizeof(sun)) == -1)
183                 goto close_fail;
184
185         /* Immediately queue a request to get our pnn. */
186         if (!ctdb_getpnn_send(ctdb, CTDB_CURRENT_NODE, set_pnn, NULL))
187                 goto close_fail;
188
189         return ctdb;
190
191 close_fail:
192         close_noerr(ctdb->fd);
193 free_fail:
194         free_noerr(ctdb);
195 fail:
196         return NULL;
197 }
198
199 void ctdb_disconnect(struct ctdb_connection *ctdb)
200 {
201         struct ctdb_request *i;
202
203         DEBUG(ctdb, LOG_DEBUG, "ctdb_disconnect");
204
205         while ((i = ctdb->outq) != NULL) {
206                 DLIST_REMOVE(ctdb->outq, i);
207                 ctdb_request_free(ctdb, i);
208         }
209
210         while ((i = ctdb->doneq) != NULL) {
211                 DLIST_REMOVE(ctdb->doneq, i);
212                 ctdb_request_free(ctdb, i);
213         }
214
215         if (ctdb->in)
216                 free_io_elem(ctdb->in);
217
218         remove_message_handlers(ctdb);
219
220         close(ctdb->fd);
221         /* Just in case they try to reuse */
222         ctdb->fd = -1;
223         free(ctdb);
224 }
225
226 int ctdb_get_fd(struct ctdb_connection *ctdb)
227 {
228         return ctdb->fd;
229 }
230
231 int ctdb_which_events(struct ctdb_connection *ctdb)
232 {
233         int events = POLLIN;
234
235         if (ctdb->outq)
236                 events |= POLLOUT;
237         return events;
238 }
239
240 struct ctdb_request *new_ctdb_request(size_t len,
241                                       ctdb_callback_t cb, void *cbdata)
242 {
243         struct ctdb_request *req = malloc(sizeof(*req));
244         if (!req)
245                 return NULL;
246         req->io = new_io_elem(len);
247         if (!req->io) {
248                 free(req);
249                 return NULL;
250         }
251         req->hdr.hdr = io_elem_data(req->io, NULL);
252         req->reply = NULL;
253         req->callback = cb;
254         req->priv_data = cbdata;
255         req->extra = NULL;
256         req->extra_destructor = NULL;
257         return req;
258 }
259
260 void ctdb_request_free(struct ctdb_connection *ctdb, struct ctdb_request *req)
261 {
262         if (req->next || req->prev) {
263                 DEBUG(ctdb, LOG_ALERT,
264                       "ctdb_request_free: request not complete! ctdb_cancel? %p (id %u)",
265                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
266                 ctdb_cancel(ctdb, req);
267                 return;
268         }
269         if (req->extra_destructor) {
270                 req->extra_destructor(ctdb, req);
271         }
272         if (req->reply) {
273                 free_io_elem(req->reply);
274         }
275         free_io_elem(req->io);
276         free(req);
277 }
278
279 /* Sanity-checking wrapper for reply. */
280 static struct ctdb_reply_call *unpack_reply_call(struct ctdb_connection *ctdb,
281                                                  struct ctdb_request *req,
282                                                  uint32_t callid)
283 {
284         size_t len;
285         struct ctdb_reply_call *inhdr = io_elem_data(req->reply, &len);
286
287         /* Library user error if this isn't a reply to a call. */
288         if (req->hdr.hdr->operation != CTDB_REQ_CALL) {
289                 errno = EINVAL;
290                 DEBUG(ctdb, LOG_ALERT,
291                       "This was not a ctdbd call request: operation %u",
292                       req->hdr.hdr->operation);
293                 return NULL;
294         }
295
296         if (req->hdr.call->callid != callid) {
297                 errno = EINVAL;
298                 DEBUG(ctdb, LOG_ALERT,
299                       "This was not a ctdbd %u call request: %u",
300                       callid, req->hdr.call->callid);
301                 return NULL;
302         }
303
304         /* ctdbd or our error if this isn't a reply call. */
305         if (len < sizeof(*inhdr) || inhdr->hdr.operation != CTDB_REPLY_CALL) {
306                 errno = EIO;
307                 DEBUG(ctdb, LOG_CRIT,
308                       "Invalid ctdbd call reply: len %zu, operation %u",
309                       len, inhdr->hdr.operation);
310                 return NULL;
311         }
312
313         return inhdr;
314 }
315
316 /* Sanity-checking wrapper for reply. */
317 struct ctdb_reply_control *unpack_reply_control(struct ctdb_connection *ctdb,
318                                                 struct ctdb_request *req,
319                                                 enum ctdb_controls control)
320 {
321         size_t len;
322         struct ctdb_reply_control *inhdr = io_elem_data(req->reply, &len);
323
324         /* Library user error if this isn't a reply to a call. */
325         if (len < sizeof(*inhdr)) {
326                 errno = EINVAL;
327                 DEBUG(ctdb, LOG_ALERT,
328                       "Short ctdbd control reply: %zu bytes", len);
329                 return NULL;
330         }
331         if (req->hdr.hdr->operation != CTDB_REQ_CONTROL) {
332                 errno = EINVAL;
333                 DEBUG(ctdb, LOG_ALERT,
334                       "This was not a ctdbd control request: operation %u",
335                       req->hdr.hdr->operation);
336                 return NULL;
337         }
338
339         /* ... or if it was a different control from what we expected. */
340         if (req->hdr.control->opcode != control) {
341                 errno = EINVAL;
342                 DEBUG(ctdb, LOG_ALERT,
343                       "This was not an opcode %u ctdbd control request: %u",
344                       control, req->hdr.control->opcode);
345                 return NULL;
346         }
347
348         /* ctdbd or our error if this isn't a reply call. */
349         if (inhdr->hdr.operation != CTDB_REPLY_CONTROL) {
350                 errno = EIO;
351                 DEBUG(ctdb, LOG_CRIT,
352                       "Invalid ctdbd control reply: operation %u",
353                       inhdr->hdr.operation);
354                 return NULL;
355         }
356
357         return inhdr;
358 }
359
360 static void handle_incoming(struct ctdb_connection *ctdb, struct io_elem *in)
361 {
362         struct ctdb_req_header *hdr;
363         size_t len;
364         struct ctdb_request *i;
365
366         hdr = io_elem_data(in, &len);
367         /* FIXME: use len to check packet! */
368
369         if (hdr->operation == CTDB_REQ_MESSAGE) {
370                 deliver_message(ctdb, hdr);
371                 return;
372         }
373
374         for (i = ctdb->doneq; i; i = i->next) {
375                 if (i->hdr.hdr->reqid == hdr->reqid) {
376                         DLIST_REMOVE(ctdb->doneq, i);
377                         i->reply = in;
378                         i->callback(ctdb, i, i->priv_data);
379                         return;
380                 }
381         }
382         DEBUG(ctdb, LOG_WARNING,
383               "Unexpected ctdbd request reply: operation %u reqid %u",
384               hdr->operation, hdr->reqid);
385         free_io_elem(in);
386 }
387
/* Filter out "harmless" errors: a negative @ret with errno EINTR or
 * EWOULDBLOCK becomes 0; everything else is passed through unchanged. */
static ssize_t real_error(ssize_t ret)
{
        if (ret >= 0) {
                return ret;
        }
        if (errno == EINTR || errno == EWOULDBLOCK) {
                return 0;
        }
        return ret;
}
395
396 bool ctdb_service(struct ctdb_connection *ctdb, int revents)
397 {
398         if (ctdb->broken) {
399                 return false;
400         }
401
402         if (holding_lock(ctdb)) {
403                 DEBUG(ctdb, LOG_ALERT, "Do not block while holding lock!");
404         }
405
406         if (revents & POLLOUT) {
407                 while (ctdb->outq) {
408                         if (real_error(write_io_elem(ctdb->fd,
409                                                      ctdb->outq->io)) < 0) {
410                                 DEBUG(ctdb, LOG_ERR,
411                                       "ctdb_service: error writing to ctdbd");
412                                 ctdb->broken = true;
413                                 return false;
414                         }
415                         if (io_elem_finished(ctdb->outq->io)) {
416                                 struct ctdb_request *done = ctdb->outq;
417                                 DLIST_REMOVE(ctdb->outq, done);
418                                 /* We add at the head: any dead ones
419                                  * sit and end. */
420                                 DLIST_ADD(ctdb->doneq, done);
421                         }
422                 }
423         }
424
425         while (revents & POLLIN) {
426                 int ret;
427                 int num_ready = 0;
428
429                 if (ioctl(ctdb->fd, FIONREAD, &num_ready) != 0) {
430                         DEBUG(ctdb, LOG_ERR,
431                               "ctdb_service: ioctl(FIONREAD) %d", errno);
432                         ctdb->broken = true;
433                         return false;
434                 }
435                 if (num_ready == 0) {
436                         /* the descriptor has been closed or we have all our data */
437                         break;
438                 }
439
440
441                 if (!ctdb->in) {
442                         ctdb->in = new_io_elem(sizeof(struct ctdb_req_header));
443                         if (!ctdb->in) {
444                                 DEBUG(ctdb, LOG_ERR,
445                                       "ctdb_service: allocating readbuf");
446                                 ctdb->broken = true;
447                                 return false;
448                         }
449                 }
450
451                 ret = read_io_elem(ctdb->fd, ctdb->in);
452                 if (real_error(ret) < 0 || ret == 0) {
453                         /* They closed fd? */
454                         if (ret == 0)
455                                 errno = EBADF;
456                         DEBUG(ctdb, LOG_ERR,
457                               "ctdb_service: error reading from ctdbd");
458                         ctdb->broken = true;
459                         return false;
460                 } else if (ret < 0) {
461                         /* No progress, stop loop. */
462                         break;
463                 } else if (io_elem_finished(ctdb->in)) {
464                         io_elem_queue(ctdb, ctdb->in);
465                         ctdb->in = NULL;
466                 }
467         }
468
469
470         while (ctdb->inqueue != NULL) {
471                 struct io_elem *io = ctdb->inqueue;
472
473                 io_elem_dequeue(ctdb, io);
474                 handle_incoming(ctdb, io);
475         }
476
477         return true;
478 }
479
480 /* This is inefficient.  We could pull in idtree.c. */
481 static bool reqid_used(const struct ctdb_connection *ctdb, uint32_t reqid)
482 {
483         struct ctdb_request *i;
484
485         for (i = ctdb->outq; i; i = i->next) {
486                 if (i->hdr.hdr->reqid == reqid) {
487                         return true;
488                 }
489         }
490         for (i = ctdb->doneq; i; i = i->next) {
491                 if (i->hdr.hdr->reqid == reqid) {
492                         return true;
493                 }
494         }
495         return false;
496 }
497
498 uint32_t new_reqid(struct ctdb_connection *ctdb)
499 {
500         while (reqid_used(ctdb, ctdb->next_id)) {
501                 ctdb->next_id++;
502         }
503         return ctdb->next_id++;
504 }
505
506 struct ctdb_request *new_ctdb_control_request(struct ctdb_connection *ctdb,
507                                               uint32_t opcode,
508                                               uint32_t destnode,
509                                               const void *extra_data,
510                                               size_t extra,
511                                               ctdb_callback_t callback,
512                                               void *cbdata)
513 {
514         struct ctdb_request *req;
515         struct ctdb_req_control *pkt;
516
517         req = new_ctdb_request(offsetof(struct ctdb_req_control, data) + extra, callback, cbdata);
518         if (!req)
519                 return NULL;
520
521         io_elem_init_req_header(req->io,
522                                 CTDB_REQ_CONTROL, destnode, new_reqid(ctdb));
523
524         pkt = req->hdr.control;
525         pkt->pad = 0;
526         pkt->opcode = opcode;
527         pkt->srvid = 0;
528         pkt->client_id = 0;
529         pkt->flags = 0;
530         pkt->datalen = extra;
531         memcpy(pkt->data, extra_data, extra);
532         DLIST_ADD(ctdb->outq, req);
533         return req;
534 }
535
/* Callback installed by ctdb_cancel(): just frees the request once its
 * reply arrives. */
void ctdb_cancel_callback(struct ctdb_connection *ctdb,
                          struct ctdb_request *req,
                          void *unused)
{
        (void)unused;
        ctdb_request_free(ctdb, req);
}
542
543 void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
544 {
545         if (!req->next && !req->prev) {
546                 DEBUG(ctdb, LOG_ALERT,
547                       "ctdb_cancel: request completed! ctdb_request_free? %p (id %u)",
548                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
549                 ctdb_request_free(ctdb, req);
550                 return;
551         }
552
553         DEBUG(ctdb, LOG_DEBUG, "ctdb_cancel: %p (id %u)",
554               req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
555
556         /* FIXME: If it's not sent, we could just free it right now. */
557         req->callback = ctdb_cancel_callback;
558 }
559
560 void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db)
561 {
562         cleanup_locks(ctdb, db);
563         tdb_close(db->tdb);
564         free(db);
565 }
566
567 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
568                                     struct ctdb_request *req,
569                                     void *_db)
570 {
571         struct ctdb_db *db = _db;
572
573         /* Do callback on original request. */
574         db->callback(ctdb, req->extra, db->private_data);
575 }
576
577 struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
578                                    struct ctdb_request *req)
579 {
580         struct ctdb_request *dbpath_req = req->extra;
581         struct ctdb_reply_control *reply;
582         struct ctdb_db *db = req->priv_data;
583         uint32_t tdb_flags = db->tdb_flags;
584         struct tdb_logging_context log;
585
586         /* Never sent the dbpath request?  We've failed. */
587         if (!dbpath_req) {
588                 /* FIXME: Save errno? */
589                 errno = EINVAL;
590                 return NULL;
591         }
592
593         reply = unpack_reply_control(ctdb, dbpath_req, CTDB_CONTROL_GETDBPATH);
594         if (!reply) {
595                 return NULL;
596         }
597         if (reply->status != 0) {
598                 DEBUG(db->ctdb, LOG_ERR,
599                       "ctdb_attachdb_recv: reply status %i", reply->status);
600                 return NULL;
601         }
602
603         tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC;
604         tdb_flags |= TDB_DISALLOW_NESTING;
605
606         log.log_fn = ctdb_tdb_log_bridge;
607         log.log_private = ctdb;
608         db->tdb = tdb_open_ex((char *)reply->data, 0, tdb_flags, O_RDWR, 0,
609                               &log, NULL);
610         if (db->tdb == NULL) {
611                 DEBUG(db->ctdb, LOG_ERR,
612                       "ctdb_attachdb_recv: failed to tdb_open %s",
613                       (char *)reply->data);
614                 return NULL;
615         }
616
617         /* Finally, separate the db from the request (see destroy_req_db). */
618         req->priv_data = NULL;
619         DEBUG(db->ctdb, LOG_DEBUG,
620               "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
621         return db;
622 }
623
624 static void attachdb_done(struct ctdb_connection *ctdb,
625                           struct ctdb_request *req,
626                           void *_db)
627 {
628         struct ctdb_db *db = _db;
629         struct ctdb_request *req2;
630         struct ctdb_reply_control *reply;
631         enum ctdb_controls control = CTDB_CONTROL_DB_ATTACH;
632
633         if (db->persistent) {
634                 control = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
635         }
636
637         reply = unpack_reply_control(ctdb, req, control);
638         if (!reply || reply->status != 0) {
639                 if (reply) {
640                         DEBUG(ctdb, LOG_ERR,
641                               "ctdb_attachdb_send(async): DB_ATTACH status %i",
642                               reply->status);
643                 }
644                 /* We failed.  Hand request to user and have them discover it
645                  * via ctdb_attachdb_recv. */
646                 db->callback(ctdb, req, db->private_data);
647                 return;
648         }
649         db->id = *(uint32_t *)reply->data;
650
651         /* Now we do another call, to get the dbpath. */
652         req2 = new_ctdb_control_request(db->ctdb, CTDB_CONTROL_GETDBPATH,
653                                         CTDB_CURRENT_NODE,
654                                         &db->id, sizeof(db->id),
655                                         attachdb_getdbpath_done, db);
656         if (!req2) {
657                 DEBUG(db->ctdb, LOG_ERR,
658                       "ctdb_attachdb_send(async): failed to allocate");
659                 db->callback(ctdb, req, db->private_data);
660                 return;
661         }
662         req->extra = req2;
663         req2->extra = req;
664         DEBUG(db->ctdb, LOG_DEBUG,
665               "ctdb_attachdb_send(async): created getdbpath request");
666 }
667
668 static void destroy_req_db(struct ctdb_connection *ctdb,
669                            struct ctdb_request *req)
670 {
671         /* Incomplete db is in priv_data. */
672         free(req->priv_data);
673         /* second request is chained off this one. */
674         if (req->extra) {
675                 ctdb_request_free(ctdb, req->extra);
676         }
677 }
678
679 struct ctdb_request *
680 ctdb_attachdb_send(struct ctdb_connection *ctdb,
681                    const char *name, bool persistent, uint32_t tdb_flags,
682                    ctdb_callback_t callback, void *private_data)
683 {
684         struct ctdb_request *req;
685         struct ctdb_db *db;
686         uint32_t opcode;
687
688         /* FIXME: Search if db already open. */
689         db = malloc(sizeof(*db));
690         if (!db) {
691                 return NULL;
692         }
693
694         if (persistent) {
695                 opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
696         } else {
697                 opcode = CTDB_CONTROL_DB_ATTACH;
698         }
699
700         req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
701                                        strlen(name) + 1, attachdb_done, db);
702         if (!req) {
703                 DEBUG(ctdb, LOG_ERR,
704                       "ctdb_attachdb_send: failed allocating DB_ATTACH");
705                 free(db);
706                 return NULL;
707         }
708
709         db->ctdb = ctdb;
710         db->tdb_flags = tdb_flags;
711         db->persistent = persistent;
712         db->callback = callback;
713         db->private_data = private_data;
714
715         req->extra_destructor = destroy_req_db;
716         /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
717         req->extra = NULL;
718
719         /* Flags get overloaded into srvid. */
720         req->hdr.control->srvid = tdb_flags;
721         DEBUG(db->ctdb, LOG_DEBUG,
722               "ctdb_attachdb_send: DB_ATTACH request %p", req);
723         return req;
724 }
725
726 static unsigned long lock_magic(struct ctdb_lock *lock)
727 {
728         /* A non-zero magic specific to this structure. */
729         return ((unsigned long)lock->key.dptr
730                 ^ (((unsigned long)lock->key.dptr) << 16)
731                 ^ 0xBADC0FFEEBADC0DEULL)
732                 | 1;
733 }
734
735 /* This is only called on locks before they're held. */
736 static void free_lock(struct ctdb_lock *lock)
737 {
738         if (lock->held_magic) {
739                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
740                       "free_lock invalid lock %p", lock);
741         }
742         free(lock->hdr);
743         free(lock);
744 }
745
746
747 void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock)
748 {
749         if (lock->held_magic != lock_magic(lock)) {
750                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
751                       "ctdb_release_lock invalid lock %p", lock);
752         } else if (lock->ctdb_db != ctdb_db) {
753                 errno = EBADF;
754                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
755                       "ctdb_release_lock: wrong ctdb_db.");
756         } else {
757                 tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
758                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
759                       "ctdb_release_lock %p", lock);
760                 remove_lock(lock->ctdb_db->ctdb, lock);
761         }
762         lock->held_magic = 0;
763         free_lock(lock);
764 }
765
766
767 /* We keep the lock if local node is the dmaster. */
768 static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
769 {
770         struct ctdb_ltdb_header *hdr;
771
772         if (tdb_chainlock(lock->ctdb_db->tdb, lock->key) != 0) {
773                 DEBUG(lock->ctdb_db->ctdb, LOG_WARNING,
774                       "ctdb_readrecordlock_async: failed to chainlock");
775                 return NULL;
776         }
777
778         hdr = ctdb_local_fetch(lock->ctdb_db->tdb, lock->key, data);
779         if (hdr && lock->readonly && (hdr->flags & CTDB_REC_RO_HAVE_READONLY) ) {
780                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
781                       "ctdb_readrecordlock_async: got local lock for ro");
782                 lock->held_magic = lock_magic(lock);
783                 lock->hdr = hdr;
784                 add_lock(lock->ctdb_db->ctdb, lock);
785                 return true;
786         }
787         if (hdr && hdr->dmaster == lock->ctdb_db->ctdb->pnn) {
788                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
789                       "ctdb_readrecordlock_async: got local lock");
790                 lock->held_magic = lock_magic(lock);
791                 lock->hdr = hdr;
792                 add_lock(lock->ctdb_db->ctdb, lock);
793                 return true;
794         }
795
796         /* we dont have the record locally,
797          * drop to writelock to force a migration
798          */
799         if (!hdr && lock->readonly) {
800                 lock->readonly = false;
801         }
802
803         tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
804         free(hdr);
805         return NULL;
806 }
807
808 /* If they shutdown before we hand them the lock, we free it here. */
809 static void destroy_lock(struct ctdb_connection *ctdb,
810                          struct ctdb_request *req)
811 {
812         free_lock(req->extra);
813 }
814
815 static void readrecordlock_retry(struct ctdb_connection *ctdb,
816                                  struct ctdb_request *req, void *private)
817 {
818         struct ctdb_lock *lock = req->extra;
819         struct ctdb_reply_call *reply;
820         TDB_DATA data;
821
822         /* OK, we've received reply to fetch-with-header migration */
823         reply = unpack_reply_call(ctdb, req, CTDB_FETCH_WITH_HEADER_FUNC);
824         if (!reply || reply->status != 0) {
825                 if (reply) {
826                         DEBUG(ctdb, LOG_ERR,
827                               "ctdb_readrecordlock_async(async):"
828                               " FETCH_WITH_HEADER_FUNC returned %i", reply->status);
829                 }
830                 lock->callback(lock->ctdb_db, NULL, tdb_null, private);
831                 ctdb_request_free(ctdb, req); /* Also frees lock. */
832                 return;
833         }
834
835         /* Can we get lock now? */
836         if (try_readrecordlock(lock, &data)) {
837                 /* Now it's their responsibility to free lock & request! */
838                 req->extra_destructor = NULL;
839                 lock->callback(lock->ctdb_db, lock, data, private);
840                 ctdb_request_free(ctdb, req);
841                 return;
842         }
843
844         /* Retransmit the same request again (we lost race). */
845         io_elem_reset(req->io);
846         DLIST_ADD(ctdb->outq, req);
847 }
848
849 static bool
850 ctdb_readrecordlock_internal(struct ctdb_db *ctdb_db, TDB_DATA key,
851                              bool readonly,
852                              ctdb_rrl_callback_t callback, void *cbdata)
853 {
854         struct ctdb_request *req;
855         struct ctdb_lock *lock;
856         TDB_DATA data;
857
858         if (holding_lock(ctdb_db->ctdb)) {
859                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
860                       "ctdb_readrecordlock_async: already holding lock");
861                 return false;
862         }
863
864         /* Setup lock */
865         lock = malloc(sizeof(*lock) + key.dsize);
866         if (!lock) {
867                 DEBUG(ctdb_db->ctdb, LOG_ERR,
868                       "ctdb_readrecordlock_async: lock allocation failed");
869                 return false;
870         }
871         lock->key.dptr = (void *)(lock + 1);
872         memcpy(lock->key.dptr, key.dptr, key.dsize);
873         lock->key.dsize = key.dsize;
874         lock->ctdb_db = ctdb_db;
875         lock->hdr = NULL;
876         lock->held_magic = 0;
877         lock->readonly = readonly;
878
879         /* Fast path. */
880         if (try_readrecordlock(lock, &data)) {
881                 callback(ctdb_db, lock, data, cbdata);
882                 return true;
883         }
884
885         /* Slow path: create request. */
886         req = new_ctdb_request(offsetof(struct ctdb_req_call, data)
887                                + key.dsize, readrecordlock_retry, cbdata);
888         if (!req) {
889                 DEBUG(ctdb_db->ctdb, LOG_ERR,
890                       "ctdb_readrecordlock_async: allocation failed");
891                 free_lock(lock);
892                 return NULL;
893         }
894         req->extra = lock;
895         req->extra_destructor = destroy_lock;
896         /* We store the original callback in the lock, and use our own. */
897         lock->callback = callback;
898
899         io_elem_init_req_header(req->io, CTDB_REQ_CALL, CTDB_CURRENT_NODE,
900                                 new_reqid(ctdb_db->ctdb));
901
902         if (lock->readonly) {
903                 req->hdr.call->flags = CTDB_WANT_READONLY;
904         } else {
905                 req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
906         }
907         req->hdr.call->db_id = ctdb_db->id;
908         req->hdr.call->callid = CTDB_FETCH_WITH_HEADER_FUNC;
909         req->hdr.call->hopcount = 0;
910         req->hdr.call->keylen = key.dsize;
911         req->hdr.call->calldatalen = 0;
912         memcpy(req->hdr.call->data, key.dptr, key.dsize);
913         DLIST_ADD(ctdb_db->ctdb->outq, req);
914         return true;
915 }
916
917 bool
918 ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
919                           ctdb_rrl_callback_t callback, void *cbdata)
920 {
921         return ctdb_readrecordlock_internal(ctdb_db, key,
922                         false,
923                         callback, cbdata);
924 }
925
926 bool
927 ctdb_readonlyrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
928                           ctdb_rrl_callback_t callback, void *cbdata)
929 {
930         return ctdb_readrecordlock_internal(ctdb_db, key,
931                         true,
932                         callback, cbdata);
933 }
934
935 bool ctdb_writerecord(struct ctdb_db *ctdb_db,
936                       struct ctdb_lock *lock, TDB_DATA data)
937 {
938         if (lock->readonly) {
939                 errno = EBADF;
940                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
941                       "ctdb_writerecord: Can not write, read-only record.");
942                 return false;
943         }
944
945         if (lock->ctdb_db != ctdb_db) {
946                 errno = EBADF;
947                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
948                       "ctdb_writerecord: Can not write, wrong ctdb_db.");
949                 return false;
950         }
951
952         if (lock->held_magic != lock_magic(lock)) {
953                 errno = EBADF;
954                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
955                       "ctdb_writerecord: Can not write. Lock has been released.");
956                 return false;
957         }
958                 
959         if (ctdb_db->persistent) {
960                 errno = EINVAL;
961                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
962                       "ctdb_writerecord: cannot write to persistent db");
963                 return false;
964         }
965
966         switch (ctdb_local_store(ctdb_db->tdb, lock->key, lock->hdr, data)) {
967         case 0:
968                 DEBUG(ctdb_db->ctdb, LOG_DEBUG,
969                       "ctdb_writerecord: optimized away noop write.");
970                 /* fall thru */
971         case 1:
972                 return true;
973
974         default:
975                 switch (errno) {
976                 case ENOMEM:
977                         DEBUG(ctdb_db->ctdb, LOG_CRIT,
978                               "ctdb_writerecord: out of memory.");
979                         break;
980                 case EINVAL:
981                         DEBUG(ctdb_db->ctdb, LOG_ALERT,
982                               "ctdb_writerecord: record changed under lock?");
983                         break;
984                 default: /* TDB already logged. */
985                         break;
986                 }
987                 return false;
988         }
989 }
990
991
/* Per-traverse bookkeeping, allocated by ctdb_traverse_async(). */
struct ctdb_traverse_state {
        /* Request currently outstanding for this traverse (handler
         * registration, traverse-start control or handler removal);
         * reset to NULL after the request has been freed. */
        struct ctdb_request *handle;
        /* Database being traversed. */
        struct ctdb_db *ctdb_db;
        /* Service id msg_h is registered on: CTDB_SRVID_TRAVERSE_RANGE
         * or'ed with a per-call counter. */
        uint64_t srvid;

        /* User callback and its opaque argument. */
        ctdb_traverse_callback_t callback;
        void *cbdata;
};
1000
1001 static void traverse_remhnd_cb(struct ctdb_connection *ctdb,
1002                         struct ctdb_request *req, void *private_data)
1003 {
1004         struct ctdb_traverse_state *state = private_data;
1005
1006         if (!ctdb_remove_message_handler_recv(ctdb, state->handle)) {
1007                 DEBUG(ctdb, LOG_ERR,
1008                                 "Failed to remove message handler for"
1009                                 " traverse.");
1010                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1011                                 TRAVERSE_STATUS_ERROR,
1012                                 tdb_null, tdb_null,
1013                                 state->cbdata);
1014         }
1015         ctdb_request_free(ctdb, state->handle);
1016         state->handle = NULL;
1017         free(state);
1018 }
1019         
1020 static void msg_h(struct ctdb_connection *ctdb, uint64_t srvid,
1021            TDB_DATA data, void *private_data)
1022 {
1023         struct ctdb_traverse_state *state = private_data;
1024         struct ctdb_db *ctdb_db = state->ctdb_db;
1025         struct ctdb_rec_data *d = (struct ctdb_rec_data *)data.dptr;
1026         TDB_DATA key;
1027
1028         if (data.dsize < sizeof(uint32_t) ||
1029             d->length != data.dsize) {
1030                 DEBUG(ctdb, LOG_ERR,
1031                         "Bad data size %u in traverse_handler",
1032                         (unsigned)data.dsize);
1033                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1034                                 TRAVERSE_STATUS_ERROR,
1035                                 tdb_null, tdb_null,
1036                                 state->cbdata);
1037                 state->handle = ctdb_remove_message_handler_send(
1038                                 state->ctdb_db->ctdb, state->srvid,
1039                                 msg_h, state,
1040                                 traverse_remhnd_cb, state);
1041                 return;
1042         }
1043
1044         key.dsize = d->keylen;
1045         key.dptr  = &d->data[0];
1046         data.dsize = d->datalen;
1047         data.dptr = &d->data[d->keylen];
1048
1049         if (key.dsize == 0 && data.dsize == 0) {
1050                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1051                                 TRAVERSE_STATUS_FINISHED,
1052                                 tdb_null, tdb_null,
1053                                 state->cbdata);
1054                 state->handle = ctdb_remove_message_handler_send(
1055                                 state->ctdb_db->ctdb, state->srvid,
1056                                 msg_h, state,
1057                                 traverse_remhnd_cb, state);
1058                 return;
1059         }
1060
1061         if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1062                 /* empty records are deleted records in ctdb */
1063                 return;
1064         }
1065
1066         data.dsize -= sizeof(struct ctdb_ltdb_header);
1067         data.dptr  += sizeof(struct ctdb_ltdb_header);
1068
1069         if (state->callback(ctdb, ctdb_db,
1070                         TRAVERSE_STATUS_RECORD,
1071                         key, data, state->cbdata) != 0) {
1072                 state->handle = ctdb_remove_message_handler_send(
1073                                 state->ctdb_db->ctdb, state->srvid,
1074                                 msg_h, state,
1075                                 traverse_remhnd_cb, state);
1076                 return;
1077         }
1078 }
1079
1080 static void traverse_start_cb(struct ctdb_connection *ctdb,
1081                         struct ctdb_request *req, void *private_data)
1082 {
1083         struct ctdb_traverse_state *state = private_data;
1084
1085         ctdb_request_free(ctdb, state->handle);
1086         state->handle = NULL;
1087 }
1088
1089 static void traverse_msghnd_cb(struct ctdb_connection *ctdb,
1090                         struct ctdb_request *req, void *private_data)
1091 {
1092         struct ctdb_traverse_state *state = private_data;
1093         struct ctdb_db *ctdb_db = state->ctdb_db;
1094         struct ctdb_traverse_start t;
1095
1096         if (!ctdb_set_message_handler_recv(ctdb, state->handle)) {
1097                 DEBUG(ctdb, LOG_ERR,
1098                                 "Failed to register message handler for"
1099                                 " traverse.");
1100                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1101                                 TRAVERSE_STATUS_ERROR,
1102                                 tdb_null, tdb_null,
1103                                 state->cbdata);
1104                 ctdb_request_free(ctdb, state->handle);
1105                 state->handle = NULL;
1106                 free(state);
1107                 return;
1108         }
1109         ctdb_request_free(ctdb, state->handle);
1110         state->handle = NULL;
1111
1112         t.db_id = ctdb_db->id;
1113         t.srvid = state->srvid;
1114         t.reqid = 0;
1115
1116         state->handle = new_ctdb_control_request(ctdb,
1117                                 CTDB_CONTROL_TRAVERSE_START,
1118                                 CTDB_CURRENT_NODE,
1119                                 &t, sizeof(t),
1120                                 traverse_start_cb, state);
1121         if (state->handle == NULL) {
1122                 DEBUG(ctdb, LOG_ERR,
1123                                 "ctdb_traverse_async:"
1124                                 " failed to send traverse_start control");
1125                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1126                                 TRAVERSE_STATUS_ERROR,
1127                                 tdb_null, tdb_null,
1128                                 state->cbdata);
1129                 state->handle = ctdb_remove_message_handler_send(
1130                                 state->ctdb_db->ctdb, state->srvid,
1131                                 msg_h, state,
1132                                 traverse_remhnd_cb, state);
1133                 return;
1134         }
1135 }
1136
1137 bool ctdb_traverse_async(struct ctdb_db *ctdb_db,
1138                          ctdb_traverse_callback_t callback, void *cbdata)
1139 {
1140         struct ctdb_connection *ctdb = ctdb_db->ctdb;
1141         struct ctdb_traverse_state *state;
1142         static uint32_t tid = 0;
1143
1144         state = malloc(sizeof(struct ctdb_traverse_state));
1145         if (state == NULL) {
1146                 DEBUG(ctdb, LOG_ERR,
1147                                 "ctdb_traverse_async: no memory."
1148                                 " allocate state failed");
1149                 return false;
1150         }
1151
1152         tid++;
1153         state->srvid = CTDB_SRVID_TRAVERSE_RANGE|tid;
1154
1155         state->callback = callback;
1156         state->cbdata   = cbdata;
1157         state->ctdb_db  = ctdb_db;
1158
1159         state->handle = ctdb_set_message_handler_send(ctdb_db->ctdb,
1160                                 state->srvid,
1161                                 msg_h, state,
1162                                 traverse_msghnd_cb, state);
1163         if (state->handle == NULL) {
1164                 DEBUG(ctdb, LOG_ERR,
1165                         "ctdb_traverse_async:"
1166                         " failed ctdb_set_message_handler_send");
1167                 free(state);
1168                 return false;
1169         }
1170
1171         return true;
1172 }