56c97afe4331bb1db9a72f3c0df9724ebaf2bcbd
[ddiss/samba.git] / lib / ntdb / open.c
1  /*
2    Trivial Database 2: opening and closing TDBs
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/build_assert/build_assert.h>
20
21 /* All open ntdbs, kept to detect double-opens (fcntl file locks don't nest!) */
22 static struct ntdb_context *tdbs = NULL;
23
24 static struct ntdb_file *find_file(dev_t device, ino_t ino)
25 {
26         struct ntdb_context *i;
27
28         for (i = tdbs; i; i = i->next) {
29                 if (i->file->device == device && i->file->inode == ino) {
30                         i->file->refcnt++;
31                         return i->file;
32                 }
33         }
34         return NULL;
35 }
36
/*
 * Read exactly len bytes from fd into buf, looping over short reads.
 *
 * Returns true once all bytes have arrived.  Returns false if read()
 * fails (errno is whatever read() left) or if end-of-file is reached
 * first (errno is then set to EWOULDBLOCK).
 */
static bool read_all(int fd, void *buf, size_t len)
{
        char *cursor = buf;
        size_t remaining = len;

        while (remaining != 0) {
                ssize_t got = read(fd, cursor, remaining);

                if (got < 0)
                        return false;
                if (got == 0) {
                        /* ETOOSHORT? */
                        errno = EWOULDBLOCK;
                        return false;
                }
                cursor += got;
                remaining -= got;
        }
        return true;
}
54
55 static uint32_t random_number(struct ntdb_context *ntdb)
56 {
57         int fd;
58         uint32_t ret = 0;
59         struct timeval now;
60
61         fd = open("/dev/urandom", O_RDONLY);
62         if (fd >= 0) {
63                 if (read_all(fd, &ret, sizeof(ret))) {
64                         close(fd);
65                         return ret;
66                 }
67                 close(fd);
68         }
69         /* FIXME: Untested!  Based on Wikipedia protocol description! */
70         fd = open("/dev/egd-pool", O_RDWR);
71         if (fd >= 0) {
72                 /* Command is 1, next byte is size we want to read. */
73                 char cmd[2] = { 1, sizeof(uint32_t) };
74                 if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
75                         char reply[1 + sizeof(uint32_t)];
76                         int r = read(fd, reply, sizeof(reply));
77                         if (r > 1) {
78                                 /* Copy at least some bytes. */
79                                 memcpy(&ret, reply+1, r - 1);
80                                 if (reply[0] == sizeof(uint32_t)
81                                     && r == sizeof(reply)) {
82                                         close(fd);
83                                         return ret;
84                                 }
85                         }
86                 }
87                 close(fd);
88         }
89
90         /* Fallback: pid and time. */
91         gettimeofday(&now, NULL);
92         ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
93         ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
94                    "ntdb_open: random from getpid and time");
95         return ret;
96 }
97
98 static void ntdb_context_init(struct ntdb_context *ntdb)
99 {
100         /* Initialize the NTDB fields here */
101         ntdb_io_init(ntdb);
102         ntdb->direct_access = 0;
103         ntdb->transaction = NULL;
104         ntdb->access = NULL;
105 }
106
/* initialise a new database:
 *
 *      struct ntdb_header;
 *      struct {
 *              struct ntdb_used_record hash_header;
 *              ntdb_off_t hash_buckets[1 << ntdb->hash_bits];
 *      } hash;
 *      struct ntdb_freetable ftable;
 *      struct {
 *              struct ntdb_free_record free_header;
 *              char forty_three[...];
 *      } remainder;
 *
 * NEW_DATABASE_HDR_SIZE(hbits) is the number of bytes in that layout up
 * to and including the trailing free record's header, for a hash table
 * of (1 << hbits) buckets.  The argument is parenthesized so that any
 * expression may be passed safely.
 */
#define NEW_DATABASE_HDR_SIZE(hbits)                                    \
        (sizeof(struct ntdb_header)                                     \
         + sizeof(struct ntdb_used_record)                              \
         + (sizeof(ntdb_off_t) << (hbits))                              \
         + sizeof(struct ntdb_freetable)                                \
         + sizeof(struct ntdb_free_record))
125
126 static enum NTDB_ERROR ntdb_new_database(struct ntdb_context *ntdb,
127                                          struct ntdb_attribute_seed *seed,
128                                          struct ntdb_header *rhdr)
129 {
130         /* We make it up in memory, then write it out if not internal */
131         struct ntdb_freetable *ftable;
132         struct ntdb_used_record *htable;
133         struct ntdb_header *hdr;
134         struct ntdb_free_record *remainder;
135         char *mem;
136         unsigned int magic_len;
137         ssize_t rlen;
138         size_t dbsize, hashsize, hdrsize, remaindersize;
139         enum NTDB_ERROR ecode;
140
141         hashsize = sizeof(ntdb_off_t) << ntdb->hash_bits;
142
143         /* Always make db a multiple of NTDB_PGSIZE */
144         hdrsize = NEW_DATABASE_HDR_SIZE(ntdb->hash_bits);
145         dbsize = (hdrsize + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
146
147         mem = ntdb->alloc_fn(ntdb, dbsize, ntdb->alloc_data);
148         if (!mem) {
149                 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
150                                    "ntdb_new_database: failed to allocate");
151         }
152
153         hdr = (void *)mem;
154         htable = (void *)(mem + sizeof(*hdr));
155         ftable = (void *)(mem + sizeof(*hdr) + sizeof(*htable) + hashsize);
156         remainder = (void *)(mem + sizeof(*hdr) + sizeof(*htable) + hashsize
157                              + sizeof(*ftable));
158
159         /* Fill in the header */
160         hdr->version = NTDB_VERSION;
161         if (seed)
162                 hdr->hash_seed = seed->seed;
163         else
164                 hdr->hash_seed = random_number(ntdb);
165         hdr->hash_test = NTDB_HASH_MAGIC;
166         hdr->hash_test = ntdb->hash_fn(&hdr->hash_test,
167                                        sizeof(hdr->hash_test),
168                                        hdr->hash_seed,
169                                        ntdb->hash_data);
170         hdr->hash_bits = ntdb->hash_bits;
171         hdr->recovery = 0;
172         hdr->features_used = hdr->features_offered = NTDB_FEATURE_MASK;
173         hdr->seqnum = 0;
174         hdr->capabilities = 0;
175         memset(hdr->reserved, 0, sizeof(hdr->reserved));
176
177         /* Hash is all zero after header. */
178         set_header(NULL, htable, NTDB_HTABLE_MAGIC, 0, hashsize, hashsize);
179         memset(htable + 1, 0, hashsize);
180
181         /* Free is empty. */
182         hdr->free_table = (char *)ftable - (char *)hdr;
183         memset(ftable, 0, sizeof(*ftable));
184         ecode = set_header(NULL, &ftable->hdr, NTDB_FTABLE_MAGIC, 0,
185                            sizeof(*ftable) - sizeof(ftable->hdr),
186                            sizeof(*ftable) - sizeof(ftable->hdr));
187         if (ecode != NTDB_SUCCESS) {
188                 goto out;
189         }
190
191         /* Rest of database is a free record, containing junk. */
192         remaindersize = dbsize - hdrsize;
193         remainder->ftable_and_len
194                 = (remaindersize + sizeof(*remainder)
195                    - sizeof(struct ntdb_used_record));
196         remainder->next = 0;
197         remainder->magic_and_prev
198                 = (NTDB_FREE_MAGIC << (64-NTDB_OFF_UPPER_STEAL))
199                 | ((char *)remainder - (char *)hdr);
200         memset(remainder + 1, 0x43, remaindersize);
201
202         /* Put in our single free entry. */
203         ftable->buckets[size_to_bucket(remaindersize)] =
204                 (char *)remainder - (char *)hdr;
205
206         /* Magic food */
207         memset(hdr->magic_food, 0, sizeof(hdr->magic_food));
208         strcpy(hdr->magic_food, NTDB_MAGIC_FOOD);
209
210         /* This creates an endian-converted database, as if read from disk */
211         magic_len = sizeof(hdr->magic_food);
212         ntdb_convert(ntdb, (char *)hdr + magic_len, hdrsize - magic_len);
213
214         /* Return copy of header. */
215         *rhdr = *hdr;
216
217         if (ntdb->flags & NTDB_INTERNAL) {
218                 ntdb->file->map_size = dbsize;
219                 ntdb->file->map_ptr = hdr;
220                 return NTDB_SUCCESS;
221         }
222         if (lseek(ntdb->file->fd, 0, SEEK_SET) == -1) {
223                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
224                                     "ntdb_new_database:"
225                                     " failed to seek: %s", strerror(errno));
226                 goto out;
227         }
228
229         if (ftruncate(ntdb->file->fd, 0) == -1) {
230                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
231                                     "ntdb_new_database:"
232                                     " failed to truncate: %s", strerror(errno));
233                 goto out;
234         }
235
236         rlen = write(ntdb->file->fd, hdr, dbsize);
237         if (rlen != dbsize) {
238                 if (rlen >= 0)
239                         errno = ENOSPC;
240                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
241                                     "ntdb_new_database: %zi writing header: %s",
242                                     rlen, strerror(errno));
243                 goto out;
244         }
245
246 out:
247         ntdb->free_fn(hdr, ntdb->alloc_data);
248         return ecode;
249 }
250
251 static enum NTDB_ERROR ntdb_new_file(struct ntdb_context *ntdb)
252 {
253         ntdb->file = ntdb->alloc_fn(NULL, sizeof(*ntdb->file), ntdb->alloc_data);
254         if (!ntdb->file)
255                 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
256                                   "ntdb_open: cannot alloc ntdb_file structure");
257         ntdb->file->num_lockrecs = 0;
258         ntdb->file->lockrecs = NULL;
259         ntdb->file->allrecord_lock.count = 0;
260         ntdb->file->refcnt = 1;
261         ntdb->file->map_ptr = NULL;
262         return NTDB_SUCCESS;
263 }
264
265 _PUBLIC_ enum NTDB_ERROR ntdb_set_attribute(struct ntdb_context *ntdb,
266                                  const union ntdb_attribute *attr)
267 {
268         switch (attr->base.attr) {
269         case NTDB_ATTRIBUTE_LOG:
270                 ntdb->log_fn = attr->log.fn;
271                 ntdb->log_data = attr->log.data;
272                 break;
273         case NTDB_ATTRIBUTE_HASH:
274         case NTDB_ATTRIBUTE_SEED:
275         case NTDB_ATTRIBUTE_OPENHOOK:
276         case NTDB_ATTRIBUTE_HASHSIZE:
277                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
278                                    NTDB_LOG_USE_ERROR,
279                                    "ntdb_set_attribute:"
280                                    " cannot set %s after opening",
281                                    attr->base.attr == NTDB_ATTRIBUTE_HASH
282                                    ? "NTDB_ATTRIBUTE_HASH"
283                                    : attr->base.attr == NTDB_ATTRIBUTE_SEED
284                                    ? "NTDB_ATTRIBUTE_SEED"
285                                    : attr->base.attr == NTDB_ATTRIBUTE_OPENHOOK
286                                    ? "NTDB_ATTRIBUTE_OPENHOOK"
287                                    : "NTDB_ATTRIBUTE_HASHSIZE");
288         case NTDB_ATTRIBUTE_STATS:
289                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
290                                    NTDB_LOG_USE_ERROR,
291                                    "ntdb_set_attribute:"
292                                    " cannot set NTDB_ATTRIBUTE_STATS");
293         case NTDB_ATTRIBUTE_FLOCK:
294                 ntdb->lock_fn = attr->flock.lock;
295                 ntdb->unlock_fn = attr->flock.unlock;
296                 ntdb->lock_data = attr->flock.data;
297                 break;
298         case NTDB_ATTRIBUTE_ALLOCATOR:
299                 ntdb->alloc_fn = attr->alloc.alloc;
300                 ntdb->expand_fn = attr->alloc.expand;
301                 ntdb->free_fn = attr->alloc.free;
302                 ntdb->alloc_data = attr->alloc.priv_data;
303                 break;
304         default:
305                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
306                                    NTDB_LOG_USE_ERROR,
307                                    "ntdb_set_attribute:"
308                                    " unknown attribute type %u",
309                                    attr->base.attr);
310         }
311         return NTDB_SUCCESS;
312 }
313
314 _PUBLIC_ enum NTDB_ERROR ntdb_get_attribute(struct ntdb_context *ntdb,
315                                  union ntdb_attribute *attr)
316 {
317         switch (attr->base.attr) {
318         case NTDB_ATTRIBUTE_LOG:
319                 if (!ntdb->log_fn)
320                         return NTDB_ERR_NOEXIST;
321                 attr->log.fn = ntdb->log_fn;
322                 attr->log.data = ntdb->log_data;
323                 break;
324         case NTDB_ATTRIBUTE_HASH:
325                 attr->hash.fn = ntdb->hash_fn;
326                 attr->hash.data = ntdb->hash_data;
327                 break;
328         case NTDB_ATTRIBUTE_SEED:
329                 attr->seed.seed = ntdb->hash_seed;
330                 break;
331         case NTDB_ATTRIBUTE_OPENHOOK:
332                 if (!ntdb->openhook)
333                         return NTDB_ERR_NOEXIST;
334                 attr->openhook.fn = ntdb->openhook;
335                 attr->openhook.data = ntdb->openhook_data;
336                 break;
337         case NTDB_ATTRIBUTE_STATS: {
338                 size_t size = attr->stats.size;
339                 if (size > ntdb->stats.size)
340                         size = ntdb->stats.size;
341                 memcpy(&attr->stats, &ntdb->stats, size);
342                 break;
343         }
344         case NTDB_ATTRIBUTE_FLOCK:
345                 attr->flock.lock = ntdb->lock_fn;
346                 attr->flock.unlock = ntdb->unlock_fn;
347                 attr->flock.data = ntdb->lock_data;
348                 break;
349         case NTDB_ATTRIBUTE_ALLOCATOR:
350                 attr->alloc.alloc = ntdb->alloc_fn;
351                 attr->alloc.expand = ntdb->expand_fn;
352                 attr->alloc.free = ntdb->free_fn;
353                 attr->alloc.priv_data = ntdb->alloc_data;
354                 break;
355         case NTDB_ATTRIBUTE_HASHSIZE:
356                 attr->hashsize.size = 1 << ntdb->hash_bits;
357                 break;
358         default:
359                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
360                                    NTDB_LOG_USE_ERROR,
361                                    "ntdb_get_attribute:"
362                                    " unknown attribute type %u",
363                                    attr->base.attr);
364         }
365         attr->base.next = NULL;
366         return NTDB_SUCCESS;
367 }
368
369 _PUBLIC_ void ntdb_unset_attribute(struct ntdb_context *ntdb,
370                          enum ntdb_attribute_type type)
371 {
372         switch (type) {
373         case NTDB_ATTRIBUTE_LOG:
374                 ntdb->log_fn = NULL;
375                 break;
376         case NTDB_ATTRIBUTE_OPENHOOK:
377                 ntdb->openhook = NULL;
378                 break;
379         case NTDB_ATTRIBUTE_HASH:
380         case NTDB_ATTRIBUTE_SEED:
381                 ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
382                            "ntdb_unset_attribute: cannot unset %s after opening",
383                            type == NTDB_ATTRIBUTE_HASH
384                            ? "NTDB_ATTRIBUTE_HASH"
385                            : "NTDB_ATTRIBUTE_SEED");
386                 break;
387         case NTDB_ATTRIBUTE_STATS:
388                 ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
389                            NTDB_LOG_USE_ERROR,
390                            "ntdb_unset_attribute:"
391                            "cannot unset NTDB_ATTRIBUTE_STATS");
392                 break;
393         case NTDB_ATTRIBUTE_FLOCK:
394                 ntdb->lock_fn = ntdb_fcntl_lock;
395                 ntdb->unlock_fn = ntdb_fcntl_unlock;
396                 break;
397         default:
398                 ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
399                            NTDB_LOG_USE_ERROR,
400                            "ntdb_unset_attribute: unknown attribute type %u",
401                            type);
402         }
403 }
404
405 /* The top three bits of the capability tell us whether it matters. */
406 enum NTDB_ERROR unknown_capability(struct ntdb_context *ntdb, const char *caller,
407                                   ntdb_off_t type)
408 {
409         if (type & NTDB_CAP_NOOPEN) {
410                 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
411                                   "%s: file has unknown capability %llu",
412                                   caller, type & NTDB_CAP_NOOPEN);
413         }
414
415         if ((type & NTDB_CAP_NOWRITE) && !(ntdb->flags & NTDB_RDONLY)) {
416                 return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_ERROR,
417                                   "%s: file has unknown capability %llu"
418                                   " (cannot write to it)",
419                                   caller, type & NTDB_CAP_NOOPEN);
420         }
421
422         if (type & NTDB_CAP_NOCHECK) {
423                 ntdb->flags |= NTDB_CANT_CHECK;
424         }
425         return NTDB_SUCCESS;
426 }
427
428 static enum NTDB_ERROR capabilities_ok(struct ntdb_context *ntdb,
429                                       ntdb_off_t capabilities)
430 {
431         ntdb_off_t off, next;
432         enum NTDB_ERROR ecode = NTDB_SUCCESS;
433         const struct ntdb_capability *cap;
434
435         /* Check capability list. */
436         for (off = capabilities; off && ecode == NTDB_SUCCESS; off = next) {
437                 cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
438                 if (NTDB_PTR_IS_ERR(cap)) {
439                         return NTDB_PTR_ERR(cap);
440                 }
441
442                 switch (cap->type & NTDB_CAP_TYPE_MASK) {
443                 /* We don't understand any capabilities (yet). */
444                 default:
445                         ecode = unknown_capability(ntdb, "ntdb_open", cap->type);
446                 }
447                 next = cap->next;
448                 ntdb_access_release(ntdb, cap);
449         }
450         return ecode;
451 }
452
/* Default allocator callback: plain malloc(); owner and priv_data are unused. */
static void *default_alloc(const void *owner, size_t len, void *priv_data)
{
        void *mem;

        (void)owner;
        (void)priv_data;
        mem = malloc(len);
        return mem;
}
457
/* Default resize callback: plain realloc(); priv_data is unused. */
static void *default_expand(void *ptr, size_t len, void *priv_data)
{
        void *resized;

        (void)priv_data;
        resized = realloc(ptr, len);
        return resized;
}
462
/* Default release callback: plain free(); priv_data is unused. */
static void default_free(void *ptr, void *priv_data)
{
        (void)priv_data;
        free(ptr);
}
467
468 /* First allocation needs manual search of attributes. */
469 static struct ntdb_context *alloc_ntdb(const union ntdb_attribute *attr,
470                                        const char *name)
471 {
472         size_t len = sizeof(struct ntdb_context) + strlen(name) + 1;
473
474         while (attr) {
475                 if  (attr->base.attr == NTDB_ATTRIBUTE_ALLOCATOR) {
476                         return attr->alloc.alloc(NULL, len,
477                                                  attr->alloc.priv_data);
478                 }
479                 attr = attr->base.next;
480         }
481         return default_alloc(NULL, len, NULL);
482 }
483
/*
 * Return the smallest bit count b (at least 1) such that
 * (1 << b) >= size.
 *
 * The result is capped at 64: for sizes above 2^63 no 64-bit power of
 * two is large enough, and shifting by 64 would be undefined, so 64 is
 * returned and the caller is expected to reject it as too large.
 */
static unsigned int next_pow2(uint64_t size)
{
        unsigned int bits = 1;

        while (bits < 64 && (1ULL << bits) < size)
                bits++;
        return bits;
}
492
493 _PUBLIC_ struct ntdb_context *ntdb_open(const char *name, int ntdb_flags,
494                                         int open_flags, mode_t mode,
495                                         union ntdb_attribute *attr)
496 {
497         struct ntdb_context *ntdb;
498         struct stat st;
499         int saved_errno = 0;
500         uint64_t hash_test;
501         unsigned v;
502         ssize_t rlen;
503         struct ntdb_header hdr;
504         struct ntdb_attribute_seed *seed = NULL;
505         ntdb_bool_err berr;
506         enum NTDB_ERROR ecode;
507         int openlock;
508
509         ntdb = alloc_ntdb(attr, name);
510         if (!ntdb) {
511                 /* Can't log this */
512                 errno = ENOMEM;
513                 return NULL;
514         }
515         /* Set name immediately for logging functions. */
516         ntdb->name = strcpy((char *)(ntdb + 1), name);
517         ntdb->flags = ntdb_flags;
518         ntdb->log_fn = NULL;
519         ntdb->open_flags = open_flags;
520         ntdb->file = NULL;
521         ntdb->openhook = NULL;
522         ntdb->lock_fn = ntdb_fcntl_lock;
523         ntdb->unlock_fn = ntdb_fcntl_unlock;
524         ntdb->hash_fn = ntdb_jenkins_hash;
525         memset(&ntdb->stats, 0, sizeof(ntdb->stats));
526         ntdb->stats.base.attr = NTDB_ATTRIBUTE_STATS;
527         ntdb->stats.size = sizeof(ntdb->stats);
528         ntdb->alloc_fn = default_alloc;
529         ntdb->expand_fn = default_expand;
530         ntdb->free_fn = default_free;
531         ntdb->hash_bits = NTDB_DEFAULT_HBITS; /* 64k of hash by default. */
532
533         while (attr) {
534                 switch (attr->base.attr) {
535                 case NTDB_ATTRIBUTE_HASH:
536                         ntdb->hash_fn = attr->hash.fn;
537                         ntdb->hash_data = attr->hash.data;
538                         break;
539                 case NTDB_ATTRIBUTE_SEED:
540                         seed = &attr->seed;
541                         break;
542                 case NTDB_ATTRIBUTE_OPENHOOK:
543                         ntdb->openhook = attr->openhook.fn;
544                         ntdb->openhook_data = attr->openhook.data;
545                         break;
546                 case NTDB_ATTRIBUTE_HASHSIZE:
547                         ntdb->hash_bits = next_pow2(attr->hashsize.size);
548                         if (ntdb->hash_bits > 31) {
549                                 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
550                                                     NTDB_LOG_USE_ERROR,
551                                                     "ntdb_open: hash_size %u"
552                                                     " too large",
553                                                     attr->hashsize.size);
554                                 goto fail;
555                         }
556                         break;
557                 default:
558                         /* These are set as normal. */
559                         ecode = ntdb_set_attribute(ntdb, attr);
560                         if (ecode != NTDB_SUCCESS)
561                                 goto fail;
562                 }
563                 attr = attr->base.next;
564         }
565
566         if (ntdb_flags & ~(NTDB_INTERNAL | NTDB_NOLOCK | NTDB_NOMMAP | NTDB_CONVERT
567                           | NTDB_NOSYNC | NTDB_SEQNUM | NTDB_ALLOW_NESTING
568                           | NTDB_RDONLY)) {
569                 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
570                                    "ntdb_open: unknown flags %u", ntdb_flags);
571                 goto fail;
572         }
573
574         if (seed) {
575                 if (!(ntdb_flags & NTDB_INTERNAL) && !(open_flags & O_CREAT)) {
576                         ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
577                                            NTDB_LOG_USE_ERROR,
578                                            "ntdb_open:"
579                                            " cannot set NTDB_ATTRIBUTE_SEED"
580                                            " without O_CREAT.");
581                         goto fail;
582                 }
583         }
584
585         if ((open_flags & O_ACCMODE) == O_WRONLY) {
586                 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
587                                    "ntdb_open: can't open ntdb %s write-only",
588                                    name);
589                 goto fail;
590         }
591
592         if ((open_flags & O_ACCMODE) == O_RDONLY) {
593                 openlock = F_RDLCK;
594                 ntdb->flags |= NTDB_RDONLY;
595         } else {
596                 if (ntdb_flags & NTDB_RDONLY) {
597                         ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
598                                            NTDB_LOG_USE_ERROR,
599                                            "ntdb_open: can't use NTDB_RDONLY"
600                                            " without O_RDONLY");
601                         goto fail;
602                 }
603                 openlock = F_WRLCK;
604         }
605
606         /* internal databases don't need any of the rest. */
607         if (ntdb->flags & NTDB_INTERNAL) {
608                 ntdb->flags |= (NTDB_NOLOCK | NTDB_NOMMAP);
609                 ecode = ntdb_new_file(ntdb);
610                 if (ecode != NTDB_SUCCESS) {
611                         goto fail;
612                 }
613                 ntdb->file->fd = -1;
614                 ecode = ntdb_new_database(ntdb, seed, &hdr);
615                 if (ecode == NTDB_SUCCESS) {
616                         ntdb_convert(ntdb, &hdr.hash_seed,
617                                     sizeof(hdr.hash_seed));
618                         ntdb->hash_seed = hdr.hash_seed;
619                         ntdb_context_init(ntdb);
620                         ntdb_ftable_init(ntdb);
621                 }
622                 if (ecode != NTDB_SUCCESS) {
623                         goto fail;
624                 }
625                 return ntdb;
626         }
627
628         if (stat(name, &st) != -1)
629                 ntdb->file = find_file(st.st_dev, st.st_ino);
630
631         if (!ntdb->file) {
632                 ecode = ntdb_new_file(ntdb);
633                 if (ecode != NTDB_SUCCESS) {
634                         goto fail;
635                 }
636
637                 /* Set this now, as ntdb_nest_lock examines it. */
638                 ntdb->file->map_size = 0;
639
640                 if ((ntdb->file->fd = open(name, open_flags, mode)) == -1) {
641                         enum ntdb_log_level lvl;
642                         /* errno set by open(2) */
643                         saved_errno = errno;
644
645                         /* Probing for files like this is a common pattern. */
646                         if (!(open_flags & O_CREAT) && errno == ENOENT) {
647                                 lvl = NTDB_LOG_WARNING;
648                         } else {
649                                 lvl = NTDB_LOG_ERROR;
650                         }
651                         ntdb_logerr(ntdb, NTDB_ERR_IO, lvl,
652                                    "ntdb_open: could not open file %s: %s",
653                                    name, strerror(errno));
654
655                         goto fail_errno;
656                 }
657
658                 /* ensure there is only one process initialising at once:
659                  * do it immediately to reduce the create/openlock race. */
660                 ecode = ntdb_lock_open(ntdb, openlock,
661                                        NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
662                 if (ecode != NTDB_SUCCESS) {
663                         saved_errno = errno;
664                         goto fail_errno;
665                 }
666
667                 /* on exec, don't inherit the fd */
668                 v = fcntl(ntdb->file->fd, F_GETFD, 0);
669                 fcntl(ntdb->file->fd, F_SETFD, v | FD_CLOEXEC);
670
671                 if (fstat(ntdb->file->fd, &st) == -1) {
672                         saved_errno = errno;
673                         ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
674                                    "ntdb_open: could not stat open %s: %s",
675                                    name, strerror(errno));
676                         goto fail_errno;
677                 }
678
679                 ntdb->file->device = st.st_dev;
680                 ntdb->file->inode = st.st_ino;
681         } else {
682                 /* ensure there is only one process initialising at once */
683                 ecode = ntdb_lock_open(ntdb, openlock,
684                                        NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
685                 if (ecode != NTDB_SUCCESS) {
686                         saved_errno = errno;
687                         goto fail_errno;
688                 }
689         }
690
691         /* call their open hook if they gave us one. */
692         if (ntdb->openhook) {
693                 ecode = ntdb->openhook(ntdb->file->fd, ntdb->openhook_data);
694                 if (ecode != NTDB_SUCCESS) {
695                         ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
696                                    "ntdb_open: open hook failed");
697                         goto fail;
698                 }
699                 open_flags |= O_CREAT;
700         }
701
702         /* If they used O_TRUNC, read will return 0. */
703         rlen = pread(ntdb->file->fd, &hdr, sizeof(hdr), 0);
704         if (rlen == 0 && (open_flags & O_CREAT)) {
705                 ecode = ntdb_new_database(ntdb, seed, &hdr);
706                 if (ecode != NTDB_SUCCESS) {
707                         goto fail;
708                 }
709         } else if (rlen < 0) {
710                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
711                                    "ntdb_open: error %s reading %s",
712                                    strerror(errno), name);
713                 goto fail;
714         } else if (rlen < sizeof(hdr)
715                    || strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
716                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
717                                    "ntdb_open: %s is not a ntdb file", name);
718                 goto fail;
719         }
720
721         if (hdr.version != NTDB_VERSION) {
722                 if (hdr.version == bswap_64(NTDB_VERSION))
723                         ntdb->flags |= NTDB_CONVERT;
724                 else {
725                         /* wrong version */
726                         ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
727                                            "ntdb_open:"
728                                            " %s is unknown version 0x%llx",
729                                            name, (long long)hdr.version);
730                         goto fail;
731                 }
732         } else if (ntdb->flags & NTDB_CONVERT) {
733                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
734                                    "ntdb_open:"
735                                    " %s does not need NTDB_CONVERT",
736                                    name);
737                 goto fail;
738         }
739
740         ntdb_context_init(ntdb);
741
742         ntdb_convert(ntdb, &hdr, sizeof(hdr));
743         ntdb->hash_bits = hdr.hash_bits;
744         ntdb->hash_seed = hdr.hash_seed;
745         hash_test = NTDB_HASH_MAGIC;
746         hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
747         if (hdr.hash_test != hash_test) {
748                 /* wrong hash variant */
749                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
750                                    "ntdb_open:"
751                                    " %s uses a different hash function",
752                                    name);
753                 goto fail;
754         }
755
756         ecode = capabilities_ok(ntdb, hdr.capabilities);
757         if (ecode != NTDB_SUCCESS) {
758                 goto fail;
759         }
760
761         /* Clear any features we don't understand. */
762         if ((open_flags & O_ACCMODE) != O_RDONLY) {
763                 hdr.features_used &= NTDB_FEATURE_MASK;
764                 ecode = ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
765                                                         features_used),
766                                           &hdr.features_used,
767                                           sizeof(hdr.features_used));
768                 if (ecode != NTDB_SUCCESS)
769                         goto fail;
770         }
771
772         ntdb_unlock_open(ntdb, openlock);
773
774         /* This makes sure we have current map_size and mmap. */
775         ecode = ntdb_oob(ntdb, ntdb->file->map_size, 1, true);
776         if (unlikely(ecode != NTDB_SUCCESS))
777                 goto fail;
778
779         if (ntdb->file->map_size % NTDB_PGSIZE != 0) {
780                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
781                                     "ntdb_open:"
782                                     " %s size %llu isn't a multiple of %u",
783                                     name, (long long)ntdb->file->map_size,
784                                     NTDB_PGSIZE);
785                 goto fail;
786         }
787
788         /* Now it's fully formed, recover if necessary. */
789         berr = ntdb_needs_recovery(ntdb);
790         if (unlikely(berr != false)) {
791                 if (berr < 0) {
792                         ecode = NTDB_OFF_TO_ERR(berr);
793                         goto fail;
794                 }
795                 ecode = ntdb_lock_and_recover(ntdb);
796                 if (ecode != NTDB_SUCCESS) {
797                         goto fail;
798                 }
799         }
800
801         ecode = ntdb_ftable_init(ntdb);
802         if (ecode != NTDB_SUCCESS) {
803                 goto fail;
804         }
805
806         ntdb->next = tdbs;
807         tdbs = ntdb;
808         return ntdb;
809
810  fail:
811         /* Map ecode to some logical errno. */
812         switch (NTDB_ERR_TO_OFF(ecode)) {
813         case NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT):
814         case NTDB_ERR_TO_OFF(NTDB_ERR_IO):
815                 saved_errno = EIO;
816                 break;
817         case NTDB_ERR_TO_OFF(NTDB_ERR_LOCK):
818                 saved_errno = EWOULDBLOCK;
819                 break;
820         case NTDB_ERR_TO_OFF(NTDB_ERR_OOM):
821                 saved_errno = ENOMEM;
822                 break;
823         case NTDB_ERR_TO_OFF(NTDB_ERR_EINVAL):
824                 saved_errno = EINVAL;
825                 break;
826         default:
827                 saved_errno = EINVAL;
828                 break;
829         }
830
831 fail_errno:
832 #ifdef NTDB_TRACE
833         close(ntdb->tracefd);
834 #endif
835         if (ntdb->file) {
836                 ntdb_lock_cleanup(ntdb);
837                 if (--ntdb->file->refcnt == 0) {
838                         assert(ntdb->file->num_lockrecs == 0);
839                         if (ntdb->file->map_ptr) {
840                                 if (ntdb->flags & NTDB_INTERNAL) {
841                                         ntdb->free_fn(ntdb->file->map_ptr,
842                                                       ntdb->alloc_data);
843                                 } else
844                                         ntdb_munmap(ntdb->file);
845                         }
846                         if (ntdb->file->fd != -1 && close(ntdb->file->fd) != 0)
847                                 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
848                                            "ntdb_open: failed to close ntdb fd"
849                                            " on error: %s", strerror(errno));
850                         ntdb->free_fn(ntdb->file->lockrecs, ntdb->alloc_data);
851                         ntdb->free_fn(ntdb->file, ntdb->alloc_data);
852                 }
853         }
854
855         ntdb->free_fn(ntdb, ntdb->alloc_data);
856         errno = saved_errno;
857         return NULL;
858 }
859
860 _PUBLIC_ int ntdb_close(struct ntdb_context *ntdb)
861 {
862         int ret = 0;
863         struct ntdb_context **i;
864
865         ntdb_trace(ntdb, "ntdb_close");
866
867         if (ntdb->transaction) {
868                 ntdb_transaction_cancel(ntdb);
869         }
870
871         ntdb_lock_cleanup(ntdb);
872         if (--ntdb->file->refcnt == 0) {
873                 if (ntdb->file->map_ptr) {
874                         if (ntdb->flags & NTDB_INTERNAL) {
875                                 ntdb->free_fn(ntdb->file->map_ptr,
876                                               ntdb->alloc_data);
877                         } else {
878                                 ntdb_munmap(ntdb->file);
879                         }
880                 }
881                 ret = close(ntdb->file->fd);
882                 ntdb->free_fn(ntdb->file->lockrecs, ntdb->alloc_data);
883                 ntdb->free_fn(ntdb->file, ntdb->alloc_data);
884         }
885
886         /* Remove from tdbs list */
887         for (i = &tdbs; *i; i = &(*i)->next) {
888                 if (*i == ntdb) {
889                         *i = ntdb->next;
890                         break;
891                 }
892         }
893
894 #ifdef NTDB_TRACE
895         close(ntdb->tracefd);
896 #endif
897         ntdb->free_fn(ntdb, ntdb->alloc_data);
898
899         return ret;
900 }
901
902 _PUBLIC_ void ntdb_foreach_(int (*fn)(struct ntdb_context *, void *), void *p)
903 {
904         struct ntdb_context *i;
905
906         for (i = tdbs; i; i = i->next) {
907                 if (fn(i, p) != 0)
908                         break;
909         }
910 }