tools/ctdb: Pass memory context for returning nodes in parse_nodestring
[ctdb.git] / libctdb / ctdb.c
1 /*
2    core of libctdb
3
4    Copyright (C) Rusty Russell 2010
5    Copyright (C) Ronnie Sahlberg 2011
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "config.h"
21 #include <sys/time.h>
22 #include <sys/socket.h>
23 #include <string.h>
24 #include <ctdb.h>
25 #include <poll.h>
26 #include <errno.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 #include <stdlib.h>
30 #include <sys/socket.h>
31 #include <sys/un.h>
32 #include <sys/ioctl.h>
33 #include <time.h>
34 #include "libctdb_private.h"
35 #include "io_elem.h"
36 #include "local_tdb.h"
37 #include "messages.h"
38 #include <dlinklist.h>
39 #include <ctdb_protocol.h>
40
41 /* Remove type-safety macros. */
42 #undef ctdb_attachdb_send
43 #undef ctdb_readrecordlock_async
44 #undef ctdb_readonlyrecordlock_async
45 #undef ctdb_connect
46
47 struct ctdb_lock {
48         struct ctdb_lock *next, *prev;
49
50         struct ctdb_db *ctdb_db;
51         TDB_DATA key;
52
53         /* Is this a request for read-only lock ? */
54         bool readonly;
55
56         /* This will always be set by the time user sees this. */
57         unsigned long held_magic;
58         struct ctdb_ltdb_header *hdr;
59
60         /* For convenience, we stash original callback here. */
61         ctdb_rrl_callback_t callback;
62 };
63
64 struct ctdb_db {
65         struct ctdb_connection *ctdb;
66         bool persistent;
67         uint32_t tdb_flags;
68         uint32_t id;
69         struct tdb_context *tdb;
70
71         ctdb_callback_t callback;
72         void *private_data;
73 };
74
75 static void remove_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
76 {
77         DLIST_REMOVE(ctdb->locks, lock);
78 }
79
80 /* FIXME: for thread safety, need tid info too. */
81 static bool holding_lock(struct ctdb_connection *ctdb)
82 {
83         /* For the moment, you can't ever hold more than 1 lock. */
84         return (ctdb->locks != NULL);
85 }
86
87 static void add_lock(struct ctdb_connection *ctdb, struct ctdb_lock *lock)
88 {
89         DLIST_ADD(ctdb->locks, lock);
90 }
91
92 static void cleanup_locks(struct ctdb_connection *ctdb, struct ctdb_db *db)
93 {
94         struct ctdb_lock *i, *next;
95
96         for (i = ctdb->locks; i; i = next) {
97                 /* Grab next pointer, as release_lock will free i */
98                 next = i->next;
99                 if (i->ctdb_db == db) {
100                         ctdb_release_lock(db, i);
101                 }
102         }
103 }
104
/* Close @fd without disturbing the caller's errno.
 * FIXME: Could be in shared util code with rest of ctdb */
static void close_noerr(int fd)
{
        const int saved_errno = errno;

        close(fd);
        errno = saved_errno;
}
112
/* Free @p without disturbing the caller's errno.
 * FIXME: Could be in shared util code with rest of ctdb */
static void free_noerr(void *p)
{
        const int saved_errno = errno;

        free(p);
        errno = saved_errno;
}
120
/* Switch @fd to non-blocking mode.  Best effort: failures are not reported.
 * FIXME: Could be in shared util code with rest of ctdb */
static void set_nonblocking(int fd)
{
        int flags = fcntl(fd, F_GETFL, 0);

        /* fcntl reports failure as -1; do not feed that back in as a
         * flag mask with every bit set. */
        if (flags == -1) {
                return;
        }
        (void)fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
128
/* Mark @fd close-on-exec.  Best effort: failures are not reported.
 * FIXME: Could be in shared util code with rest of ctdb */
static void set_close_on_exec(int fd)
{
        int flags = fcntl(fd, F_GETFD, 0);

        /* fcntl reports failure as -1; do not feed that back in as a
         * flag mask with every bit set. */
        if (flags == -1) {
                return;
        }
        (void)fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
}
136
137 static void set_pnn(struct ctdb_connection *ctdb,
138                     struct ctdb_request *req,
139                     void *unused)
140 {
141         if (!ctdb_getpnn_recv(ctdb, req, &ctdb->pnn)) {
142                 DEBUG(ctdb, LOG_CRIT,
143                       "ctdb_connect(async): failed to get pnn");
144                 ctdb->broken = true;
145         }
146         ctdb_request_free(req);
147 }
148
149 struct ctdb_connection *ctdb_connect(const char *addr,
150                                      ctdb_log_fn_t log_fn, void *log_priv)
151 {
152         struct ctdb_connection *ctdb;
153         struct sockaddr_un sun;
154
155         ctdb = malloc(sizeof(*ctdb));
156         if (!ctdb) {
157                 /* With no format string, we hope it doesn't use ap! */
158                 va_list ap;
159                 memset(&ap, 0, sizeof(ap));
160                 errno = ENOMEM;
161                 log_fn(log_priv, LOG_ERR, "ctdb_connect: no memory", ap);
162                 goto fail;
163         }
164         ctdb->pnn = -1;
165         ctdb->outq = NULL;
166         ctdb->doneq = NULL;
167         ctdb->in = NULL;
168         ctdb->inqueue = NULL;
169         ctdb->message_handlers = NULL;
170         ctdb->next_id = 0;
171         ctdb->broken = false;
172         ctdb->log = log_fn;
173         ctdb->log_priv = log_priv;
174         ctdb->locks = NULL;
175
176         memset(&sun, 0, sizeof(sun));
177         sun.sun_family = AF_UNIX;
178         if (!addr)
179                 addr = CTDB_PATH;
180         strncpy(sun.sun_path, addr, sizeof(sun.sun_path)-1);
181         ctdb->fd = socket(AF_UNIX, SOCK_STREAM, 0);
182         if (ctdb->fd < 0)
183                 goto free_fail;
184
185         if (connect(ctdb->fd, (struct sockaddr *)&sun, sizeof(sun)) == -1)
186                 goto close_fail;
187
188         set_nonblocking(ctdb->fd);
189         set_close_on_exec(ctdb->fd);
190
191         /* Immediately queue a request to get our pnn. */
192         if (!ctdb_getpnn_send(ctdb, CTDB_CURRENT_NODE, set_pnn, NULL))
193                 goto close_fail;
194
195         return ctdb;
196
197 close_fail:
198         close_noerr(ctdb->fd);
199 free_fail:
200         free_noerr(ctdb);
201 fail:
202         return NULL;
203 }
204
205 void ctdb_disconnect(struct ctdb_connection *ctdb)
206 {
207         struct ctdb_request *i;
208
209         DEBUG(ctdb, LOG_DEBUG, "ctdb_disconnect");
210
211         while ((i = ctdb->outq) != NULL) {
212                 DLIST_REMOVE(ctdb->outq, i);
213                 ctdb_request_free(i);
214         }
215
216         while ((i = ctdb->doneq) != NULL) {
217                 DLIST_REMOVE(ctdb->doneq, i);
218                 ctdb_request_free(i);
219         }
220
221         if (ctdb->in)
222                 free_io_elem(ctdb->in);
223
224         remove_message_handlers(ctdb);
225
226         close(ctdb->fd);
227         /* Just in case they try to reuse */
228         ctdb->fd = -1;
229         free(ctdb);
230 }
231
232 int ctdb_get_fd(struct ctdb_connection *ctdb)
233 {
234         return ctdb->fd;
235 }
236
237 int ctdb_which_events(struct ctdb_connection *ctdb)
238 {
239         int events = POLLIN;
240
241         if (ctdb->outq)
242                 events |= POLLOUT;
243         return events;
244 }
245
246 struct ctdb_request *new_ctdb_request(struct ctdb_connection *ctdb, size_t len,
247                                       ctdb_callback_t cb, void *cbdata)
248 {
249         struct ctdb_request *req = malloc(sizeof(*req));
250         if (!req)
251                 return NULL;
252         req->io = new_io_elem(len);
253         if (!req->io) {
254                 free(req);
255                 return NULL;
256         }
257         req->ctdb = ctdb;
258         req->hdr.hdr = io_elem_data(req->io, NULL);
259         req->reply = NULL;
260         req->callback = cb;
261         req->priv_data = cbdata;
262         req->extra = NULL;
263         req->extra_destructor = NULL;
264         return req;
265 }
266
267 void ctdb_request_free(struct ctdb_request *req)
268 {
269         struct ctdb_connection *ctdb = req->ctdb;
270
271         if (req->next || req->prev) {
272                 DEBUG(ctdb, LOG_ALERT,
273                       "ctdb_request_free: request not complete! ctdb_cancel? %p (id %u)",
274                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
275                 ctdb_cancel(ctdb, req);
276                 return;
277         }
278         if (req->extra_destructor) {
279                 req->extra_destructor(ctdb, req);
280         }
281         if (req->reply) {
282                 free_io_elem(req->reply);
283         }
284         free_io_elem(req->io);
285         free(req);
286 }
287
288 /* Sanity-checking wrapper for reply. */
289 static struct ctdb_reply_call *unpack_reply_call(struct ctdb_request *req,
290                                                  uint32_t callid)
291 {
292         size_t len;
293         struct ctdb_reply_call *inhdr = io_elem_data(req->reply, &len);
294
295         /* Library user error if this isn't a reply to a call. */
296         if (req->hdr.hdr->operation != CTDB_REQ_CALL) {
297                 errno = EINVAL;
298                 DEBUG(req->ctdb, LOG_ALERT,
299                       "This was not a ctdbd call request: operation %u",
300                       req->hdr.hdr->operation);
301                 return NULL;
302         }
303
304         if (req->hdr.call->callid != callid) {
305                 errno = EINVAL;
306                 DEBUG(req->ctdb, LOG_ALERT,
307                       "This was not a ctdbd %u call request: %u",
308                       callid, req->hdr.call->callid);
309                 return NULL;
310         }
311
312         /* ctdbd or our error if this isn't a reply call. */
313         if (len < sizeof(*inhdr) || inhdr->hdr.operation != CTDB_REPLY_CALL) {
314                 errno = EIO;
315                 DEBUG(req->ctdb, LOG_CRIT,
316                       "Invalid ctdbd call reply: len %zu, operation %u",
317                       len, inhdr->hdr.operation);
318                 return NULL;
319         }
320
321         return inhdr;
322 }
323
324 /* Sanity-checking wrapper for reply. */
325 struct ctdb_reply_control *unpack_reply_control(struct ctdb_request *req,
326                                                 enum ctdb_controls control)
327 {
328         size_t len;
329         struct ctdb_reply_control *inhdr = io_elem_data(req->reply, &len);
330
331         /* Library user error if this isn't a reply to a call. */
332         if (len < sizeof(*inhdr)) {
333                 errno = EINVAL;
334                 DEBUG(req->ctdb, LOG_ALERT,
335                       "Short ctdbd control reply: %zu bytes", len);
336                 return NULL;
337         }
338         if (req->hdr.hdr->operation != CTDB_REQ_CONTROL) {
339                 errno = EINVAL;
340                 DEBUG(req->ctdb, LOG_ALERT,
341                       "This was not a ctdbd control request: operation %u",
342                       req->hdr.hdr->operation);
343                 return NULL;
344         }
345
346         /* ... or if it was a different control from what we expected. */
347         if (req->hdr.control->opcode != control) {
348                 errno = EINVAL;
349                 DEBUG(req->ctdb, LOG_ALERT,
350                       "This was not an opcode %u ctdbd control request: %u",
351                       control, req->hdr.control->opcode);
352                 return NULL;
353         }
354
355         /* ctdbd or our error if this isn't a reply call. */
356         if (inhdr->hdr.operation != CTDB_REPLY_CONTROL) {
357                 errno = EIO;
358                 DEBUG(req->ctdb, LOG_CRIT,
359                       "Invalid ctdbd control reply: operation %u",
360                       inhdr->hdr.operation);
361                 return NULL;
362         }
363
364         return inhdr;
365 }
366
367 static void handle_incoming(struct ctdb_connection *ctdb, struct io_elem *in)
368 {
369         struct ctdb_req_header *hdr;
370         size_t len;
371         struct ctdb_request *i;
372
373         hdr = io_elem_data(in, &len);
374         /* FIXME: use len to check packet! */
375
376         if (hdr->operation == CTDB_REQ_MESSAGE) {
377                 deliver_message(ctdb, hdr);
378                 return;
379         }
380
381         for (i = ctdb->doneq; i; i = i->next) {
382                 if (i->hdr.hdr->reqid == hdr->reqid) {
383                         DLIST_REMOVE(ctdb->doneq, i);
384                         i->reply = in;
385                         i->callback(ctdb, i, i->priv_data);
386                         return;
387                 }
388         }
389         DEBUG(ctdb, LOG_WARNING,
390               "Unexpected ctdbd request reply: operation %u reqid %u",
391               hdr->operation, hdr->reqid);
392         free_io_elem(in);
393 }
394
/* Filter out "harmless" errors: a negative @ret with errno EINTR or
 * EWOULDBLOCK is reported as 0; every other value is passed through. */
static ssize_t real_error(ssize_t ret)
{
        if (ret >= 0) {
                return ret;
        }
        if (errno == EINTR) {
                return 0;
        }
        if (errno == EWOULDBLOCK) {
                return 0;
        }
        return ret;
}
402
403 bool ctdb_service(struct ctdb_connection *ctdb, int revents)
404 {
405         if (ctdb->broken) {
406                 return false;
407         }
408
409         if (holding_lock(ctdb)) {
410                 DEBUG(ctdb, LOG_ALERT, "Do not block while holding lock!");
411         }
412
413         if (revents & POLLOUT) {
414                 while (ctdb->outq) {
415                         if (real_error(write_io_elem(ctdb->fd,
416                                                      ctdb->outq->io)) < 0) {
417                                 DEBUG(ctdb, LOG_ERR,
418                                       "ctdb_service: error writing to ctdbd");
419                                 ctdb->broken = true;
420                                 return false;
421                         }
422                         if (io_elem_finished(ctdb->outq->io)) {
423                                 struct ctdb_request *done = ctdb->outq;
424                                 DLIST_REMOVE(ctdb->outq, done);
425                                 /* We add at the head: any dead ones
426                                  * sit and end. */
427                                 DLIST_ADD(ctdb->doneq, done);
428                         }
429                 }
430         }
431
432         while (revents & POLLIN) {
433                 int ret;
434                 int num_ready = 0;
435
436                 if (ioctl(ctdb->fd, FIONREAD, &num_ready) != 0) {
437                         DEBUG(ctdb, LOG_ERR,
438                               "ctdb_service: ioctl(FIONREAD) %d", errno);
439                         ctdb->broken = true;
440                         return false;
441                 }
442                 if (num_ready == 0) {
443                         /* the descriptor has been closed or we have all our data */
444                         break;
445                 }
446
447
448                 if (!ctdb->in) {
449                         ctdb->in = new_io_elem(sizeof(struct ctdb_req_header));
450                         if (!ctdb->in) {
451                                 DEBUG(ctdb, LOG_ERR,
452                                       "ctdb_service: allocating readbuf");
453                                 ctdb->broken = true;
454                                 return false;
455                         }
456                 }
457
458                 ret = read_io_elem(ctdb->fd, ctdb->in);
459                 if (real_error(ret) < 0 || ret == 0) {
460                         /* They closed fd? */
461                         if (ret == 0)
462                                 errno = EBADF;
463                         DEBUG(ctdb, LOG_ERR,
464                               "ctdb_service: error reading from ctdbd");
465                         ctdb->broken = true;
466                         return false;
467                 } else if (ret < 0) {
468                         /* No progress, stop loop. */
469                         break;
470                 } else if (io_elem_finished(ctdb->in)) {
471                         io_elem_queue(ctdb, ctdb->in);
472                         ctdb->in = NULL;
473                 }
474         }
475
476
477         while (ctdb->inqueue != NULL) {
478                 struct io_elem *io = ctdb->inqueue;
479
480                 io_elem_dequeue(ctdb, io);
481                 handle_incoming(ctdb, io);
482         }
483
484         return true;
485 }
486
487 /* This is inefficient.  We could pull in idtree.c. */
488 static bool reqid_used(const struct ctdb_connection *ctdb, uint32_t reqid)
489 {
490         struct ctdb_request *i;
491
492         for (i = ctdb->outq; i; i = i->next) {
493                 if (i->hdr.hdr->reqid == reqid) {
494                         return true;
495                 }
496         }
497         for (i = ctdb->doneq; i; i = i->next) {
498                 if (i->hdr.hdr->reqid == reqid) {
499                         return true;
500                 }
501         }
502         return false;
503 }
504
505 uint32_t new_reqid(struct ctdb_connection *ctdb)
506 {
507         while (reqid_used(ctdb, ctdb->next_id)) {
508                 ctdb->next_id++;
509         }
510         return ctdb->next_id++;
511 }
512
513 struct ctdb_request *new_ctdb_control_request(struct ctdb_connection *ctdb,
514                                               uint32_t opcode,
515                                               uint32_t destnode,
516                                               const void *extra_data,
517                                               size_t extra,
518                                               ctdb_callback_t callback,
519                                               void *cbdata)
520 {
521         struct ctdb_request *req;
522         struct ctdb_req_control *pkt;
523
524         req = new_ctdb_request(
525                 ctdb, offsetof(struct ctdb_req_control, data) + extra,
526                 callback, cbdata);
527         if (!req)
528                 return NULL;
529
530         io_elem_init_req_header(req->io,
531                                 CTDB_REQ_CONTROL, destnode, new_reqid(ctdb));
532
533         pkt = req->hdr.control;
534         pkt->pad = 0;
535         pkt->opcode = opcode;
536         pkt->srvid = 0;
537         pkt->client_id = 0;
538         pkt->flags = 0;
539         pkt->datalen = extra;
540         memcpy(pkt->data, extra_data, extra);
541         DLIST_ADD(ctdb->outq, req);
542         return req;
543 }
544
/* Callback installed by ctdb_cancel: the reply is of no interest, so the
 * request is simply freed.  The third argument is ignored. */
void ctdb_cancel_callback(struct ctdb_connection *ctdb,
                          struct ctdb_request *req,
                          void *unused)
{
        ctdb_request_free(req);
}
551
552 void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req)
553 {
554         if (!req->next && !req->prev) {
555                 DEBUG(ctdb, LOG_ALERT,
556                       "ctdb_cancel: request completed! ctdb_request_free? %p (id %u)",
557                       req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
558                 ctdb_request_free(req);
559                 return;
560         }
561
562         DEBUG(ctdb, LOG_DEBUG, "ctdb_cancel: %p (id %u)",
563               req, req->hdr.hdr ? req->hdr.hdr->reqid : 0);
564
565         /* FIXME: If it's not sent, we could just free it right now. */
566         req->callback = ctdb_cancel_callback;
567 }
568
569 void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db)
570 {
571         cleanup_locks(ctdb, db);
572         tdb_close(db->tdb);
573         free(db);
574 }
575
576 static void destroy_req_db(struct ctdb_connection *ctdb,
577                            struct ctdb_request *req);
578 static void attachdb_done(struct ctdb_connection *ctdb,
579                           struct ctdb_request *req,
580                           void *_db);
581 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
582                                     struct ctdb_request *req,
583                                     void *_db);
584
585 struct ctdb_request *
586 ctdb_attachdb_send(struct ctdb_connection *ctdb,
587                    const char *name, bool persistent, uint32_t tdb_flags,
588                    ctdb_callback_t callback, void *private_data)
589 {
590         struct ctdb_request *req;
591         struct ctdb_db *db;
592         uint32_t opcode;
593
594         /* FIXME: Search if db already open. */
595         db = malloc(sizeof(*db));
596         if (!db) {
597                 return NULL;
598         }
599
600         if (persistent) {
601                 opcode = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
602         } else {
603                 opcode = CTDB_CONTROL_DB_ATTACH;
604         }
605
606         req = new_ctdb_control_request(ctdb, opcode, CTDB_CURRENT_NODE, name,
607                                        strlen(name) + 1, attachdb_done, db);
608         if (!req) {
609                 DEBUG(ctdb, LOG_ERR,
610                       "ctdb_attachdb_send: failed allocating DB_ATTACH");
611                 free(db);
612                 return NULL;
613         }
614
615         db->ctdb = ctdb;
616         db->tdb_flags = tdb_flags;
617         db->persistent = persistent;
618         db->callback = callback;
619         db->private_data = private_data;
620
621         req->extra_destructor = destroy_req_db;
622         /* This is set non-NULL when we succeed, see ctdb_attachdb_recv */
623         req->extra = NULL;
624
625         /* Flags get overloaded into srvid. */
626         req->hdr.control->srvid = tdb_flags;
627         DEBUG(db->ctdb, LOG_DEBUG,
628               "ctdb_attachdb_send: DB_ATTACH request %p", req);
629         return req;
630 }
631
632 static void destroy_req_db(struct ctdb_connection *ctdb,
633                            struct ctdb_request *req)
634 {
635         /* Incomplete db is in priv_data. */
636         free(req->priv_data);
637         /* second request is chained off this one. */
638         if (req->extra) {
639                 ctdb_request_free(req->extra);
640         }
641 }
642
643 static void attachdb_done(struct ctdb_connection *ctdb,
644                           struct ctdb_request *req,
645                           void *_db)
646 {
647         struct ctdb_db *db = _db;
648         struct ctdb_request *req2;
649         struct ctdb_reply_control *reply;
650         enum ctdb_controls control = CTDB_CONTROL_DB_ATTACH;
651
652         if (db->persistent) {
653                 control = CTDB_CONTROL_DB_ATTACH_PERSISTENT;
654         }
655
656         reply = unpack_reply_control(req, control);
657         if (!reply || reply->status != 0) {
658                 if (reply) {
659                         DEBUG(ctdb, LOG_ERR,
660                               "ctdb_attachdb_send(async): DB_ATTACH status %i",
661                               reply->status);
662                 }
663                 /* We failed.  Hand request to user and have them discover it
664                  * via ctdb_attachdb_recv. */
665                 db->callback(ctdb, req, db->private_data);
666                 return;
667         }
668         db->id = *(uint32_t *)reply->data;
669
670         /* Now we do another call, to get the dbpath. */
671         req2 = new_ctdb_control_request(db->ctdb, CTDB_CONTROL_GETDBPATH,
672                                         CTDB_CURRENT_NODE,
673                                         &db->id, sizeof(db->id),
674                                         attachdb_getdbpath_done, db);
675         if (!req2) {
676                 DEBUG(db->ctdb, LOG_ERR,
677                       "ctdb_attachdb_send(async): failed to allocate");
678                 db->callback(ctdb, req, db->private_data);
679                 return;
680         }
681         req->extra = req2;
682         req2->extra = req;
683         DEBUG(db->ctdb, LOG_DEBUG,
684               "ctdb_attachdb_send(async): created getdbpath request");
685 }
686
687 static void attachdb_getdbpath_done(struct ctdb_connection *ctdb,
688                                     struct ctdb_request *req,
689                                     void *_db)
690 {
691         struct ctdb_db *db = _db;
692
693         /* Do callback on original request. */
694         db->callback(ctdb, req->extra, db->private_data);
695 }
696
697 struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
698                                    struct ctdb_request *req)
699 {
700         struct ctdb_request *dbpath_req = req->extra;
701         struct ctdb_reply_control *reply;
702         struct ctdb_db *db = req->priv_data;
703         uint32_t tdb_flags = db->tdb_flags;
704         struct tdb_logging_context log;
705
706         /* Never sent the dbpath request?  We've failed. */
707         if (!dbpath_req) {
708                 /* FIXME: Save errno? */
709                 errno = EINVAL;
710                 return NULL;
711         }
712
713         reply = unpack_reply_control(dbpath_req, CTDB_CONTROL_GETDBPATH);
714         if (!reply) {
715                 return NULL;
716         }
717         if (reply->status != 0) {
718                 DEBUG(db->ctdb, LOG_ERR,
719                       "ctdb_attachdb_recv: reply status %i", reply->status);
720                 return NULL;
721         }
722
723         tdb_flags = db->persistent ? TDB_DEFAULT : TDB_NOSYNC;
724         tdb_flags |= TDB_DISALLOW_NESTING;
725
726         log.log_fn = ctdb_tdb_log_bridge;
727         log.log_private = ctdb;
728         db->tdb = tdb_open_ex((char *)reply->data, 0, tdb_flags, O_RDWR, 0,
729                               &log, NULL);
730         if (db->tdb == NULL) {
731                 DEBUG(db->ctdb, LOG_ERR,
732                       "ctdb_attachdb_recv: failed to tdb_open %s",
733                       (char *)reply->data);
734                 return NULL;
735         }
736
737         /* Finally, separate the db from the request (see destroy_req_db). */
738         req->priv_data = NULL;
739         DEBUG(db->ctdb, LOG_DEBUG,
740               "ctdb_attachdb_recv: db %p, tdb %s", db, (char *)reply->data);
741         return db;
742 }
743
744 static unsigned long lock_magic(struct ctdb_lock *lock)
745 {
746         /* A non-zero magic specific to this structure. */
747         return ((unsigned long)lock->key.dptr
748                 ^ (((unsigned long)lock->key.dptr) << 16)
749                 ^ 0xBADC0FFEEBADC0DEULL)
750                 | 1;
751 }
752
753 /* This is only called on locks before they're held. */
754 static void free_lock(struct ctdb_lock *lock)
755 {
756         if (lock->held_magic) {
757                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
758                       "free_lock invalid lock %p", lock);
759         }
760         free(lock->hdr);
761         free(lock);
762 }
763
764
765 void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock)
766 {
767         if (lock->held_magic != lock_magic(lock)) {
768                 DEBUG(lock->ctdb_db->ctdb, LOG_ALERT,
769                       "ctdb_release_lock invalid lock %p", lock);
770         } else if (lock->ctdb_db != ctdb_db) {
771                 errno = EBADF;
772                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
773                       "ctdb_release_lock: wrong ctdb_db.");
774         } else {
775                 tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
776                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
777                       "ctdb_release_lock %p", lock);
778                 remove_lock(lock->ctdb_db->ctdb, lock);
779         }
780         lock->held_magic = 0;
781         free_lock(lock);
782 }
783
784
785 /* We keep the lock if local node is the dmaster. */
786 static bool try_readrecordlock(struct ctdb_lock *lock, TDB_DATA *data)
787 {
788         struct ctdb_ltdb_header *hdr;
789
790         if (tdb_chainlock(lock->ctdb_db->tdb, lock->key) != 0) {
791                 DEBUG(lock->ctdb_db->ctdb, LOG_WARNING,
792                       "ctdb_readrecordlock_async: failed to chainlock");
793                 return NULL;
794         }
795
796         hdr = ctdb_local_fetch(lock->ctdb_db->tdb, lock->key, data);
797         if (hdr && lock->readonly && (hdr->flags & CTDB_REC_RO_HAVE_READONLY) ) {
798                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
799                       "ctdb_readrecordlock_async: got local lock for ro");
800                 lock->held_magic = lock_magic(lock);
801                 lock->hdr = hdr;
802                 add_lock(lock->ctdb_db->ctdb, lock);
803                 return true;
804         }
805         if (hdr && hdr->dmaster == lock->ctdb_db->ctdb->pnn) {
806                 DEBUG(lock->ctdb_db->ctdb, LOG_DEBUG,
807                       "ctdb_readrecordlock_async: got local lock");
808                 lock->held_magic = lock_magic(lock);
809                 lock->hdr = hdr;
810                 add_lock(lock->ctdb_db->ctdb, lock);
811                 return true;
812         }
813
814         /* we dont have the record locally,
815          * drop to writelock to force a migration
816          */
817         if (!hdr && lock->readonly) {
818                 lock->readonly = false;
819         }
820
821         tdb_chainunlock(lock->ctdb_db->tdb, lock->key);
822         free(hdr);
823         return NULL;
824 }
825
826 /* If they shutdown before we hand them the lock, we free it here. */
827 static void destroy_lock(struct ctdb_connection *ctdb,
828                          struct ctdb_request *req)
829 {
830         free_lock(req->extra);
831 }
832
833 static void readrecordlock_retry(struct ctdb_connection *ctdb,
834                                  struct ctdb_request *req, void *private)
835 {
836         struct ctdb_lock *lock = req->extra;
837         struct ctdb_reply_call *reply;
838         TDB_DATA data;
839
840         /* OK, we've received reply to fetch migration */
841         reply = unpack_reply_call(req, CTDB_FETCH_FUNC);
842         if (!reply || reply->status != 0) {
843                 if (reply) {
844                         DEBUG(ctdb, LOG_ERR,
845                               "ctdb_readrecordlock_async(async):"
846                               " FETCH returned %i", reply->status);
847                 }
848                 lock->callback(lock->ctdb_db, NULL, tdb_null, private);
849                 ctdb_request_free(req); /* Also frees lock. */
850                 return;
851         }
852
853         /* Can we get lock now? */
854         if (try_readrecordlock(lock, &data)) {
855                 /* Now it's their responsibility to free lock & request! */
856                 req->extra_destructor = NULL;
857                 lock->callback(lock->ctdb_db, lock, data, private);
858                 ctdb_request_free(req);
859                 return;
860         }
861
862         /* Retransmit the same request again (we lost race). */
863         io_elem_reset(req->io);
864         DLIST_ADD(ctdb->outq, req);
865 }
866
867 static bool
868 ctdb_readrecordlock_internal(struct ctdb_db *ctdb_db, TDB_DATA key,
869                              bool readonly,
870                              ctdb_rrl_callback_t callback, void *cbdata)
871 {
872         struct ctdb_request *req;
873         struct ctdb_lock *lock;
874         TDB_DATA data;
875
876         if (holding_lock(ctdb_db->ctdb)) {
877                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
878                       "ctdb_readrecordlock_async: already holding lock");
879                 return false;
880         }
881
882         /* Setup lock */
883         lock = malloc(sizeof(*lock) + key.dsize);
884         if (!lock) {
885                 DEBUG(ctdb_db->ctdb, LOG_ERR,
886                       "ctdb_readrecordlock_async: lock allocation failed");
887                 return false;
888         }
889         lock->key.dptr = (void *)(lock + 1);
890         memcpy(lock->key.dptr, key.dptr, key.dsize);
891         lock->key.dsize = key.dsize;
892         lock->ctdb_db = ctdb_db;
893         lock->hdr = NULL;
894         lock->held_magic = 0;
895         lock->readonly = readonly;
896
897         /* Fast path. */
898         if (try_readrecordlock(lock, &data)) {
899                 callback(ctdb_db, lock, data, cbdata);
900                 return true;
901         }
902
903         /* Slow path: create request. */
904         req = new_ctdb_request(
905                 ctdb_db->ctdb,
906                 offsetof(struct ctdb_req_call, data) + key.dsize,
907                 readrecordlock_retry, cbdata);
908         if (!req) {
909                 DEBUG(ctdb_db->ctdb, LOG_ERR,
910                       "ctdb_readrecordlock_async: allocation failed");
911                 free_lock(lock);
912                 return NULL;
913         }
914         req->extra = lock;
915         req->extra_destructor = destroy_lock;
916         /* We store the original callback in the lock, and use our own. */
917         lock->callback = callback;
918
919         io_elem_init_req_header(req->io, CTDB_REQ_CALL, CTDB_CURRENT_NODE,
920                                 new_reqid(ctdb_db->ctdb));
921
922         if (lock->readonly) {
923                 req->hdr.call->flags = CTDB_WANT_READONLY;
924         } else {
925                 req->hdr.call->flags = CTDB_IMMEDIATE_MIGRATION;
926         }
927         req->hdr.call->db_id = ctdb_db->id;
928         req->hdr.call->callid = CTDB_FETCH_FUNC;
929         req->hdr.call->hopcount = 0;
930         req->hdr.call->keylen = key.dsize;
931         req->hdr.call->calldatalen = 0;
932         memcpy(req->hdr.call->data, key.dptr, key.dsize);
933         DLIST_ADD(ctdb_db->ctdb->outq, req);
934         return true;
935 }
936
937 bool
938 ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
939                           ctdb_rrl_callback_t callback, void *cbdata)
940 {
941         return ctdb_readrecordlock_internal(ctdb_db, key,
942                         false,
943                         callback, cbdata);
944 }
945
946 bool
947 ctdb_readonlyrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
948                           ctdb_rrl_callback_t callback, void *cbdata)
949 {
950         return ctdb_readrecordlock_internal(ctdb_db, key,
951                         true,
952                         callback, cbdata);
953 }
954
955 bool ctdb_writerecord(struct ctdb_db *ctdb_db,
956                       struct ctdb_lock *lock, TDB_DATA data)
957 {
958         if (lock->readonly) {
959                 errno = EBADF;
960                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
961                       "ctdb_writerecord: Can not write, read-only record.");
962                 return false;
963         }
964
965         if (lock->ctdb_db != ctdb_db) {
966                 errno = EBADF;
967                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
968                       "ctdb_writerecord: Can not write, wrong ctdb_db.");
969                 return false;
970         }
971
972         if (lock->held_magic != lock_magic(lock)) {
973                 errno = EBADF;
974                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
975                       "ctdb_writerecord: Can not write. Lock has been released.");
976                 return false;
977         }
978                 
979         if (ctdb_db->persistent) {
980                 errno = EINVAL;
981                 DEBUG(ctdb_db->ctdb, LOG_ALERT,
982                       "ctdb_writerecord: cannot write to persistent db");
983                 return false;
984         }
985
986         switch (ctdb_local_store(ctdb_db->tdb, lock->key, lock->hdr, data)) {
987         case 0:
988                 DEBUG(ctdb_db->ctdb, LOG_DEBUG,
989                       "ctdb_writerecord: optimized away noop write.");
990                 /* fall thru */
991         case 1:
992                 return true;
993
994         default:
995                 switch (errno) {
996                 case ENOMEM:
997                         DEBUG(ctdb_db->ctdb, LOG_CRIT,
998                               "ctdb_writerecord: out of memory.");
999                         break;
1000                 case EINVAL:
1001                         DEBUG(ctdb_db->ctdb, LOG_ALERT,
1002                               "ctdb_writerecord: record changed under lock?");
1003                         break;
1004                 default: /* TDB already logged. */
1005                         break;
1006                 }
1007                 return false;
1008         }
1009 }
1010
1011
1012 struct ctdb_traverse_state {
1013         struct ctdb_request *handle;
1014         struct ctdb_db *ctdb_db;
1015         uint64_t srvid;
1016
1017         ctdb_traverse_callback_t callback;
1018         void *cbdata;
1019 };
1020
1021 static void traverse_remhnd_cb(struct ctdb_connection *ctdb,
1022                         struct ctdb_request *req, void *private_data)
1023 {
1024         struct ctdb_traverse_state *state = private_data;
1025
1026         if (!ctdb_remove_message_handler_recv(ctdb, state->handle)) {
1027                 DEBUG(ctdb, LOG_ERR,
1028                                 "Failed to remove message handler for"
1029                                 " traverse.");
1030                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1031                                 TRAVERSE_STATUS_ERROR,
1032                                 tdb_null, tdb_null,
1033                                 state->cbdata);
1034         }
1035         ctdb_request_free(state->handle);
1036         state->handle = NULL;
1037         free(state);
1038 }
1039         
1040 static void msg_h(struct ctdb_connection *ctdb, uint64_t srvid,
1041            TDB_DATA data, void *private_data)
1042 {
1043         struct ctdb_traverse_state *state = private_data;
1044         struct ctdb_db *ctdb_db = state->ctdb_db;
1045         struct ctdb_rec_data *d = (struct ctdb_rec_data *)data.dptr;
1046         TDB_DATA key;
1047
1048         if (data.dsize < sizeof(uint32_t) ||
1049             d->length != data.dsize) {
1050                 DEBUG(ctdb, LOG_ERR,
1051                         "Bad data size %u in traverse_handler",
1052                         (unsigned)data.dsize);
1053                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1054                                 TRAVERSE_STATUS_ERROR,
1055                                 tdb_null, tdb_null,
1056                                 state->cbdata);
1057                 state->handle = ctdb_remove_message_handler_send(
1058                                 state->ctdb_db->ctdb, state->srvid,
1059                                 msg_h, state,
1060                                 traverse_remhnd_cb, state);
1061                 return;
1062         }
1063
1064         key.dsize = d->keylen;
1065         key.dptr  = &d->data[0];
1066         data.dsize = d->datalen;
1067         data.dptr = &d->data[d->keylen];
1068
1069         if (key.dsize == 0 && data.dsize == 0) {
1070                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1071                                 TRAVERSE_STATUS_FINISHED,
1072                                 tdb_null, tdb_null,
1073                                 state->cbdata);
1074                 state->handle = ctdb_remove_message_handler_send(
1075                                 state->ctdb_db->ctdb, state->srvid,
1076                                 msg_h, state,
1077                                 traverse_remhnd_cb, state);
1078                 return;
1079         }
1080
1081         if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1082                 /* empty records are deleted records in ctdb */
1083                 return;
1084         }
1085
1086         data.dsize -= sizeof(struct ctdb_ltdb_header);
1087         data.dptr  += sizeof(struct ctdb_ltdb_header);
1088
1089         if (state->callback(ctdb, ctdb_db,
1090                         TRAVERSE_STATUS_RECORD,
1091                         key, data, state->cbdata) != 0) {
1092                 state->handle = ctdb_remove_message_handler_send(
1093                                 state->ctdb_db->ctdb, state->srvid,
1094                                 msg_h, state,
1095                                 traverse_remhnd_cb, state);
1096                 return;
1097         }
1098 }
1099
1100 static void traverse_start_cb(struct ctdb_connection *ctdb,
1101                         struct ctdb_request *req, void *private_data)
1102 {
1103         struct ctdb_traverse_state *state = private_data;
1104
1105         ctdb_request_free(state->handle);
1106         state->handle = NULL;
1107 }
1108
1109 static void traverse_msghnd_cb(struct ctdb_connection *ctdb,
1110                         struct ctdb_request *req, void *private_data)
1111 {
1112         struct ctdb_traverse_state *state = private_data;
1113         struct ctdb_db *ctdb_db = state->ctdb_db;
1114         struct ctdb_traverse_start t;
1115
1116         if (!ctdb_set_message_handler_recv(ctdb, state->handle)) {
1117                 DEBUG(ctdb, LOG_ERR,
1118                                 "Failed to register message handler for"
1119                                 " traverse.");
1120                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1121                                 TRAVERSE_STATUS_ERROR,
1122                                 tdb_null, tdb_null,
1123                                 state->cbdata);
1124                 ctdb_request_free(state->handle);
1125                 state->handle = NULL;
1126                 free(state);
1127                 return;
1128         }
1129         ctdb_request_free(state->handle);
1130         state->handle = NULL;
1131
1132         t.db_id = ctdb_db->id;
1133         t.srvid = state->srvid;
1134         t.reqid = 0;
1135
1136         state->handle = new_ctdb_control_request(ctdb,
1137                                 CTDB_CONTROL_TRAVERSE_START,
1138                                 CTDB_CURRENT_NODE,
1139                                 &t, sizeof(t),
1140                                 traverse_start_cb, state);
1141         if (state->handle == NULL) {
1142                 DEBUG(ctdb, LOG_ERR,
1143                                 "ctdb_traverse_async:"
1144                                 " failed to send traverse_start control");
1145                 state->callback(state->ctdb_db->ctdb, state->ctdb_db,
1146                                 TRAVERSE_STATUS_ERROR,
1147                                 tdb_null, tdb_null,
1148                                 state->cbdata);
1149                 state->handle = ctdb_remove_message_handler_send(
1150                                 state->ctdb_db->ctdb, state->srvid,
1151                                 msg_h, state,
1152                                 traverse_remhnd_cb, state);
1153                 return;
1154         }
1155 }
1156
1157 bool ctdb_traverse_async(struct ctdb_db *ctdb_db,
1158                          ctdb_traverse_callback_t callback, void *cbdata)
1159 {
1160         struct ctdb_connection *ctdb = ctdb_db->ctdb;
1161         struct ctdb_traverse_state *state;
1162         static uint32_t tid = 0;
1163
1164         state = malloc(sizeof(struct ctdb_traverse_state));
1165         if (state == NULL) {
1166                 DEBUG(ctdb, LOG_ERR,
1167                                 "ctdb_traverse_async: no memory."
1168                                 " allocate state failed");
1169                 return false;
1170         }
1171
1172         tid++;
1173         state->srvid = CTDB_SRVID_TRAVERSE_RANGE|tid;
1174
1175         state->callback = callback;
1176         state->cbdata   = cbdata;
1177         state->ctdb_db  = ctdb_db;
1178
1179         state->handle = ctdb_set_message_handler_send(ctdb_db->ctdb,
1180                                 state->srvid,
1181                                 msg_h, state,
1182                                 traverse_msghnd_cb, state);
1183         if (state->handle == NULL) {
1184                 DEBUG(ctdb, LOG_ERR,
1185                         "ctdb_traverse_async:"
1186                         " failed ctdb_set_message_handler_send");
1187                 free(state);
1188                 return false;
1189         }
1190
1191         return true;
1192 }
1193
1194 int ctdb_num_out_queue(struct ctdb_connection *ctdb)
1195 {
1196         struct ctdb_request *req;
1197         int i;
1198
1199         for (i = 0, req = ctdb->outq; req; req = req->next, i++)
1200                 ;
1201
1202         return i;
1203 }
1204
1205 int ctdb_num_in_flight(struct ctdb_connection *ctdb)
1206 {
1207         struct ctdb_request *req;
1208         int i;
1209
1210         for (i = 0, req = ctdb->doneq; req; req = req->next, i++)
1211                 ;
1212
1213         return i;
1214 }
1215
/*
 * Total number of requests on the connection: the sum of
 * ctdb_num_out_queue() and ctdb_num_in_flight().
 */
int ctdb_num_active(struct ctdb_connection *ctdb)
{
	int queued = ctdb_num_out_queue(ctdb);
	int in_flight = ctdb_num_in_flight(ctdb);

	return queued + in_flight;
}
1221