ntdb: allocator attribute.
[ddiss/samba.git] / lib / ntdb / open.c
1  /*
2    Trivial Database 2: opening and closing TDBs
3    Copyright (C) Rusty Russell 2010
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 3 of the License, or (at your option) any later version.
9
10    This library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18 #include "private.h"
19 #include <ccan/build_assert/build_assert.h>
20 #include <assert.h>
21
/* All open ntdb contexts (chained through ->next), kept so that a second
 * open of the same file can be detected (fcntl file locks don't nest!) */
static struct ntdb_context *tdbs = NULL;
24
25 static struct ntdb_file *find_file(dev_t device, ino_t ino)
26 {
27         struct ntdb_context *i;
28
29         for (i = tdbs; i; i = i->next) {
30                 if (i->file->device == device && i->file->inode == ino) {
31                         i->file->refcnt++;
32                         return i->file;
33                 }
34         }
35         return NULL;
36 }
37
/*
 * Read exactly len bytes from fd into buf, looping over short reads.
 *
 * Returns true once all len bytes have arrived (trivially true when len is
 * zero).  Returns false with errno as set by read(2) on error, or with errno
 * set to EWOULDBLOCK if end-of-file is hit first.  A read interrupted by a
 * signal (EINTR) is retried instead of being reported as a failure.
 */
static bool read_all(int fd, void *buf, size_t len)
{
        char *dst = buf;

        while (len != 0) {
                ssize_t got = read(fd, dst, len);

                if (got < 0) {
                        /* Interrupted before any data: just try again. */
                        if (errno == EINTR)
                                continue;
                        return false;
                }
                if (got == 0) {
                        /* ETOOSHORT? */
                        errno = EWOULDBLOCK;
                        return false;
                }
                dst += got;
                len -= (size_t)got;
        }
        return true;
}
55
56 static uint64_t random_number(struct ntdb_context *ntdb)
57 {
58         int fd;
59         uint64_t ret = 0;
60         struct timeval now;
61
62         fd = open("/dev/urandom", O_RDONLY);
63         if (fd >= 0) {
64                 if (read_all(fd, &ret, sizeof(ret))) {
65                         close(fd);
66                         return ret;
67                 }
68                 close(fd);
69         }
70         /* FIXME: Untested!  Based on Wikipedia protocol description! */
71         fd = open("/dev/egd-pool", O_RDWR);
72         if (fd >= 0) {
73                 /* Command is 1, next byte is size we want to read. */
74                 char cmd[2] = { 1, sizeof(uint64_t) };
75                 if (write(fd, cmd, sizeof(cmd)) == sizeof(cmd)) {
76                         char reply[1 + sizeof(uint64_t)];
77                         int r = read(fd, reply, sizeof(reply));
78                         if (r > 1) {
79                                 /* Copy at least some bytes. */
80                                 memcpy(&ret, reply+1, r - 1);
81                                 if (reply[0] == sizeof(uint64_t)
82                                     && r == sizeof(reply)) {
83                                         close(fd);
84                                         return ret;
85                                 }
86                         }
87                 }
88                 close(fd);
89         }
90
91         /* Fallback: pid and time. */
92         gettimeofday(&now, NULL);
93         ret = getpid() * 100132289ULL + now.tv_sec * 1000000ULL + now.tv_usec;
94         ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
95                    "ntdb_open: random from getpid and time");
96         return ret;
97 }
98
/*
 * Put the run-time state of a context into its initial condition: hand the
 * context to ntdb_io_init() (presumably installs the I/O methods - confirm
 * in io.c), then clear the direct-access counter and the transaction and
 * access pointers.
 */
static void ntdb_context_init(struct ntdb_context *ntdb)
{
        /* Initialize the NTDB fields here */
        ntdb_io_init(ntdb);
        ntdb->direct_access = 0;
        ntdb->transaction = NULL;
        ntdb->access = NULL;
}
107
/*
 * In-memory image of the start of a freshly created database, as built by
 * ntdb_new_database(): the file header, the initial free table, and the
 * header of the single free record that covers the rest of the image.
 */
struct new_database {
        struct ntdb_header hdr;
        struct ntdb_freetable ftable;
        struct ntdb_free_record remainder;
};
113
114 /* initialise a new database */
115 static enum NTDB_ERROR ntdb_new_database(struct ntdb_context *ntdb,
116                                        struct ntdb_attribute_seed *seed,
117                                        struct ntdb_header *hdr)
118 {
119         /* We make it up in memory, then write it out if not internal */
120         struct new_database *newdb;
121         unsigned int magic_len;
122         ssize_t rlen;
123         size_t dbsize, remaindersize;
124         enum NTDB_ERROR ecode;
125
126         /* Always make db a multiple of NTDB_PGSIZE */
127         dbsize = (sizeof(*newdb) + NTDB_PGSIZE-1) & ~(NTDB_PGSIZE-1);
128         remaindersize = dbsize - sizeof(*newdb);
129         newdb = ntdb->alloc_fn(ntdb, dbsize, ntdb->alloc_data);
130         if (!newdb) {
131                 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
132                                    "ntdb_new_database: failed to allocate");
133         }
134
135         /* Fill in the header */
136         newdb->hdr.version = NTDB_VERSION;
137         if (seed)
138                 newdb->hdr.hash_seed = seed->seed;
139         else
140                 newdb->hdr.hash_seed = random_number(ntdb);
141         newdb->hdr.hash_test = NTDB_HASH_MAGIC;
142         newdb->hdr.hash_test = ntdb->hash_fn(&newdb->hdr.hash_test,
143                                            sizeof(newdb->hdr.hash_test),
144                                            newdb->hdr.hash_seed,
145                                            ntdb->hash_data);
146         newdb->hdr.recovery = 0;
147         newdb->hdr.features_used = newdb->hdr.features_offered = NTDB_FEATURE_MASK;
148         newdb->hdr.seqnum = 0;
149         newdb->hdr.capabilities = 0;
150         memset(newdb->hdr.reserved, 0, sizeof(newdb->hdr.reserved));
151         /* Initial hashes are empty. */
152         memset(newdb->hdr.hashtable, 0, sizeof(newdb->hdr.hashtable));
153
154         /* Free is empty. */
155         newdb->hdr.free_table = offsetof(struct new_database, ftable);
156         memset(&newdb->ftable, 0, sizeof(newdb->ftable));
157         ecode = set_header(NULL, &newdb->ftable.hdr, NTDB_FTABLE_MAGIC, 0,
158                            sizeof(newdb->ftable) - sizeof(newdb->ftable.hdr),
159                            sizeof(newdb->ftable) - sizeof(newdb->ftable.hdr),
160                            0);
161         if (ecode != NTDB_SUCCESS) {
162                 goto out;
163         }
164
165         /* Rest of database is a free record, containing junk. */
166         newdb->remainder.ftable_and_len
167                 = (remaindersize + sizeof(newdb->remainder)
168                    - sizeof(struct ntdb_used_record));
169         newdb->remainder.next = 0;
170         newdb->remainder.magic_and_prev
171                 = (NTDB_FREE_MAGIC << (64-NTDB_OFF_UPPER_STEAL))
172                 | offsetof(struct new_database, remainder);
173         memset(&newdb->remainder + 1, 0x43, remaindersize);
174
175         /* Put in our single free entry. */
176         newdb->ftable.buckets[size_to_bucket(remaindersize)] =
177                 offsetof(struct new_database, remainder);
178
179         /* Magic food */
180         memset(newdb->hdr.magic_food, 0, sizeof(newdb->hdr.magic_food));
181         strcpy(newdb->hdr.magic_food, NTDB_MAGIC_FOOD);
182
183         /* This creates an endian-converted database, as if read from disk */
184         magic_len = sizeof(newdb->hdr.magic_food);
185         ntdb_convert(ntdb,
186                      (char *)&newdb->hdr + magic_len,
187                      sizeof(*newdb) - magic_len);
188
189         *hdr = newdb->hdr;
190
191         if (ntdb->flags & NTDB_INTERNAL) {
192                 ntdb->file->map_size = dbsize;
193                 ntdb->file->map_ptr = newdb;
194                 return NTDB_SUCCESS;
195         }
196         if (lseek(ntdb->file->fd, 0, SEEK_SET) == -1) {
197                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
198                                     "ntdb_new_database:"
199                                     " failed to seek: %s", strerror(errno));
200                 goto out;
201         }
202
203         if (ftruncate(ntdb->file->fd, 0) == -1) {
204                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
205                                     "ntdb_new_database:"
206                                     " failed to truncate: %s", strerror(errno));
207                 goto out;
208         }
209
210         rlen = write(ntdb->file->fd, newdb, dbsize);
211         if (rlen != dbsize) {
212                 if (rlen >= 0)
213                         errno = ENOSPC;
214                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
215                                     "ntdb_new_database: %zi writing header: %s",
216                                     rlen, strerror(errno));
217                 goto out;
218         }
219
220 out:
221         ntdb->free_fn(newdb, ntdb->alloc_data);
222         return ecode;
223 }
224
225 static enum NTDB_ERROR ntdb_new_file(struct ntdb_context *ntdb)
226 {
227         ntdb->file = ntdb->alloc_fn(NULL, sizeof(*ntdb->file), ntdb->alloc_data);
228         if (!ntdb->file)
229                 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
230                                   "ntdb_open: cannot alloc ntdb_file structure");
231         ntdb->file->num_lockrecs = 0;
232         ntdb->file->lockrecs = NULL;
233         ntdb->file->allrecord_lock.count = 0;
234         ntdb->file->refcnt = 1;
235         ntdb->file->map_ptr = NULL;
236         return NTDB_SUCCESS;
237 }
238
239 _PUBLIC_ enum NTDB_ERROR ntdb_set_attribute(struct ntdb_context *ntdb,
240                                  const union ntdb_attribute *attr)
241 {
242         switch (attr->base.attr) {
243         case NTDB_ATTRIBUTE_LOG:
244                 ntdb->log_fn = attr->log.fn;
245                 ntdb->log_data = attr->log.data;
246                 break;
247         case NTDB_ATTRIBUTE_HASH:
248         case NTDB_ATTRIBUTE_SEED:
249         case NTDB_ATTRIBUTE_OPENHOOK:
250                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
251                                    NTDB_LOG_USE_ERROR,
252                                    "ntdb_set_attribute:"
253                                    " cannot set %s after opening",
254                                    attr->base.attr == NTDB_ATTRIBUTE_HASH
255                                    ? "NTDB_ATTRIBUTE_HASH"
256                                    : attr->base.attr == NTDB_ATTRIBUTE_SEED
257                                    ? "NTDB_ATTRIBUTE_SEED"
258                                    : "NTDB_ATTRIBUTE_OPENHOOK");
259         case NTDB_ATTRIBUTE_STATS:
260                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
261                                    NTDB_LOG_USE_ERROR,
262                                    "ntdb_set_attribute:"
263                                    " cannot set NTDB_ATTRIBUTE_STATS");
264         case NTDB_ATTRIBUTE_FLOCK:
265                 ntdb->lock_fn = attr->flock.lock;
266                 ntdb->unlock_fn = attr->flock.unlock;
267                 ntdb->lock_data = attr->flock.data;
268                 break;
269         case NTDB_ATTRIBUTE_ALLOCATOR:
270                 ntdb->alloc_fn = attr->alloc.alloc;
271                 ntdb->expand_fn = attr->alloc.expand;
272                 ntdb->free_fn = attr->alloc.free;
273                 ntdb->alloc_data = attr->alloc.priv_data;
274                 break;
275         default:
276                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
277                                    NTDB_LOG_USE_ERROR,
278                                    "ntdb_set_attribute:"
279                                    " unknown attribute type %u",
280                                    attr->base.attr);
281         }
282         return NTDB_SUCCESS;
283 }
284
285 _PUBLIC_ enum NTDB_ERROR ntdb_get_attribute(struct ntdb_context *ntdb,
286                                  union ntdb_attribute *attr)
287 {
288         switch (attr->base.attr) {
289         case NTDB_ATTRIBUTE_LOG:
290                 if (!ntdb->log_fn)
291                         return NTDB_ERR_NOEXIST;
292                 attr->log.fn = ntdb->log_fn;
293                 attr->log.data = ntdb->log_data;
294                 break;
295         case NTDB_ATTRIBUTE_HASH:
296                 attr->hash.fn = ntdb->hash_fn;
297                 attr->hash.data = ntdb->hash_data;
298                 break;
299         case NTDB_ATTRIBUTE_SEED:
300                 attr->seed.seed = ntdb->hash_seed;
301                 break;
302         case NTDB_ATTRIBUTE_OPENHOOK:
303                 if (!ntdb->openhook)
304                         return NTDB_ERR_NOEXIST;
305                 attr->openhook.fn = ntdb->openhook;
306                 attr->openhook.data = ntdb->openhook_data;
307                 break;
308         case NTDB_ATTRIBUTE_STATS: {
309                 size_t size = attr->stats.size;
310                 if (size > ntdb->stats.size)
311                         size = ntdb->stats.size;
312                 memcpy(&attr->stats, &ntdb->stats, size);
313                 break;
314         }
315         case NTDB_ATTRIBUTE_FLOCK:
316                 attr->flock.lock = ntdb->lock_fn;
317                 attr->flock.unlock = ntdb->unlock_fn;
318                 attr->flock.data = ntdb->lock_data;
319                 break;
320         case NTDB_ATTRIBUTE_ALLOCATOR:
321                 attr->alloc.alloc = ntdb->alloc_fn;
322                 attr->alloc.expand = ntdb->expand_fn;
323                 attr->alloc.free = ntdb->free_fn;
324                 attr->alloc.priv_data = ntdb->alloc_data;
325                 break;
326         default:
327                 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
328                                    NTDB_LOG_USE_ERROR,
329                                    "ntdb_get_attribute:"
330                                    " unknown attribute type %u",
331                                    attr->base.attr);
332         }
333         attr->base.next = NULL;
334         return NTDB_SUCCESS;
335 }
336
337 _PUBLIC_ void ntdb_unset_attribute(struct ntdb_context *ntdb,
338                          enum ntdb_attribute_type type)
339 {
340         switch (type) {
341         case NTDB_ATTRIBUTE_LOG:
342                 ntdb->log_fn = NULL;
343                 break;
344         case NTDB_ATTRIBUTE_OPENHOOK:
345                 ntdb->openhook = NULL;
346                 break;
347         case NTDB_ATTRIBUTE_HASH:
348         case NTDB_ATTRIBUTE_SEED:
349                 ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
350                            "ntdb_unset_attribute: cannot unset %s after opening",
351                            type == NTDB_ATTRIBUTE_HASH
352                            ? "NTDB_ATTRIBUTE_HASH"
353                            : "NTDB_ATTRIBUTE_SEED");
354                 break;
355         case NTDB_ATTRIBUTE_STATS:
356                 ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
357                            NTDB_LOG_USE_ERROR,
358                            "ntdb_unset_attribute:"
359                            "cannot unset NTDB_ATTRIBUTE_STATS");
360                 break;
361         case NTDB_ATTRIBUTE_FLOCK:
362                 ntdb->lock_fn = ntdb_fcntl_lock;
363                 ntdb->unlock_fn = ntdb_fcntl_unlock;
364                 break;
365         default:
366                 ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
367                            NTDB_LOG_USE_ERROR,
368                            "ntdb_unset_attribute: unknown attribute type %u",
369                            type);
370         }
371 }
372
373 /* The top three bits of the capability tell us whether it matters. */
374 enum NTDB_ERROR unknown_capability(struct ntdb_context *ntdb, const char *caller,
375                                   ntdb_off_t type)
376 {
377         if (type & NTDB_CAP_NOOPEN) {
378                 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
379                                   "%s: file has unknown capability %llu",
380                                   caller, type & NTDB_CAP_NOOPEN);
381         }
382
383         if ((type & NTDB_CAP_NOWRITE) && !(ntdb->flags & NTDB_RDONLY)) {
384                 return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_ERROR,
385                                   "%s: file has unknown capability %llu"
386                                   " (cannot write to it)",
387                                   caller, type & NTDB_CAP_NOOPEN);
388         }
389
390         if (type & NTDB_CAP_NOCHECK) {
391                 ntdb->flags |= NTDB_CANT_CHECK;
392         }
393         return NTDB_SUCCESS;
394 }
395
396 static enum NTDB_ERROR capabilities_ok(struct ntdb_context *ntdb,
397                                       ntdb_off_t capabilities)
398 {
399         ntdb_off_t off, next;
400         enum NTDB_ERROR ecode = NTDB_SUCCESS;
401         const struct ntdb_capability *cap;
402
403         /* Check capability list. */
404         for (off = capabilities; off && ecode == NTDB_SUCCESS; off = next) {
405                 cap = ntdb_access_read(ntdb, off, sizeof(*cap), true);
406                 if (NTDB_PTR_IS_ERR(cap)) {
407                         return NTDB_PTR_ERR(cap);
408                 }
409
410                 switch (cap->type & NTDB_CAP_TYPE_MASK) {
411                 /* We don't understand any capabilities (yet). */
412                 default:
413                         ecode = unknown_capability(ntdb, "ntdb_open", cap->type);
414                 }
415                 next = cap->next;
416                 ntdb_access_release(ntdb, cap);
417         }
418         return ecode;
419 }
420
/* Default allocation hook: plain malloc(); owner and priv_data are unused. */
static void *default_alloc(const void *owner, size_t len, void *priv_data)
{
        (void)owner;
        (void)priv_data;

        return malloc(len);
}
425
/* Default resize hook: plain realloc(); priv_data is unused. */
static void *default_expand(void *ptr, size_t len, void *priv_data)
{
        (void)priv_data;

        return realloc(ptr, len);
}
430
/* Default release hook: plain free(); priv_data is unused. */
static void default_free(void *ptr, void *priv_data)
{
        (void)priv_data;

        free(ptr);
}
435
436 /* First allocation needs manual search of attributes. */
437 static struct ntdb_context *alloc_ntdb(const union ntdb_attribute *attr,
438                                        const char *name)
439 {
440         size_t len = sizeof(struct ntdb_context) + strlen(name) + 1;
441
442         while (attr) {
443                 if  (attr->base.attr == NTDB_ATTRIBUTE_ALLOCATOR) {
444                         return attr->alloc.alloc(NULL, len,
445                                                  attr->alloc.priv_data);
446                 }
447                 attr = attr->base.next;
448         }
449         return default_alloc(NULL, len, NULL);
450 }
451
452 _PUBLIC_ struct ntdb_context *ntdb_open(const char *name, int ntdb_flags,
453                                         int open_flags, mode_t mode,
454                                         union ntdb_attribute *attr)
455 {
456         struct ntdb_context *ntdb;
457         struct stat st;
458         int saved_errno = 0;
459         uint64_t hash_test;
460         unsigned v;
461         ssize_t rlen;
462         struct ntdb_header hdr;
463         struct ntdb_attribute_seed *seed = NULL;
464         ntdb_bool_err berr;
465         enum NTDB_ERROR ecode;
466         int openlock;
467
468         ntdb = alloc_ntdb(attr, name);
469         if (!ntdb) {
470                 /* Can't log this */
471                 errno = ENOMEM;
472                 return NULL;
473         }
474         /* Set name immediately for logging functions. */
475         ntdb->name = strcpy((char *)(ntdb + 1), name);
476         ntdb->flags = ntdb_flags;
477         ntdb->log_fn = NULL;
478         ntdb->open_flags = open_flags;
479         ntdb->file = NULL;
480         ntdb->openhook = NULL;
481         ntdb->lock_fn = ntdb_fcntl_lock;
482         ntdb->unlock_fn = ntdb_fcntl_unlock;
483         ntdb->hash_fn = ntdb_jenkins_hash;
484         memset(&ntdb->stats, 0, sizeof(ntdb->stats));
485         ntdb->stats.base.attr = NTDB_ATTRIBUTE_STATS;
486         ntdb->stats.size = sizeof(ntdb->stats);
487         ntdb->alloc_fn = default_alloc;
488         ntdb->expand_fn = default_expand;
489         ntdb->free_fn = default_free;
490
491         while (attr) {
492                 switch (attr->base.attr) {
493                 case NTDB_ATTRIBUTE_HASH:
494                         ntdb->hash_fn = attr->hash.fn;
495                         ntdb->hash_data = attr->hash.data;
496                         break;
497                 case NTDB_ATTRIBUTE_SEED:
498                         seed = &attr->seed;
499                         break;
500                 case NTDB_ATTRIBUTE_OPENHOOK:
501                         ntdb->openhook = attr->openhook.fn;
502                         ntdb->openhook_data = attr->openhook.data;
503                         break;
504                 default:
505                         /* These are set as normal. */
506                         ecode = ntdb_set_attribute(ntdb, attr);
507                         if (ecode != NTDB_SUCCESS)
508                                 goto fail;
509                 }
510                 attr = attr->base.next;
511         }
512
513         if (ntdb_flags & ~(NTDB_INTERNAL | NTDB_NOLOCK | NTDB_NOMMAP | NTDB_CONVERT
514                           | NTDB_NOSYNC | NTDB_SEQNUM | NTDB_ALLOW_NESTING
515                           | NTDB_RDONLY)) {
516                 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
517                                    "ntdb_open: unknown flags %u", ntdb_flags);
518                 goto fail;
519         }
520
521         if (seed) {
522                 if (!(ntdb_flags & NTDB_INTERNAL) && !(open_flags & O_CREAT)) {
523                         ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
524                                            NTDB_LOG_USE_ERROR,
525                                            "ntdb_open:"
526                                            " cannot set NTDB_ATTRIBUTE_SEED"
527                                            " without O_CREAT.");
528                         goto fail;
529                 }
530         }
531
532         if ((open_flags & O_ACCMODE) == O_WRONLY) {
533                 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
534                                    "ntdb_open: can't open ntdb %s write-only",
535                                    name);
536                 goto fail;
537         }
538
539         if ((open_flags & O_ACCMODE) == O_RDONLY) {
540                 openlock = F_RDLCK;
541                 ntdb->flags |= NTDB_RDONLY;
542         } else {
543                 if (ntdb_flags & NTDB_RDONLY) {
544                         ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL,
545                                            NTDB_LOG_USE_ERROR,
546                                            "ntdb_open: can't use NTDB_RDONLY"
547                                            " without O_RDONLY");
548                         goto fail;
549                 }
550                 openlock = F_WRLCK;
551         }
552
553         /* internal databases don't need any of the rest. */
554         if (ntdb->flags & NTDB_INTERNAL) {
555                 ntdb->flags |= (NTDB_NOLOCK | NTDB_NOMMAP);
556                 ecode = ntdb_new_file(ntdb);
557                 if (ecode != NTDB_SUCCESS) {
558                         goto fail;
559                 }
560                 ntdb->file->fd = -1;
561                 ecode = ntdb_new_database(ntdb, seed, &hdr);
562                 if (ecode == NTDB_SUCCESS) {
563                         ntdb_convert(ntdb, &hdr.hash_seed,
564                                     sizeof(hdr.hash_seed));
565                         ntdb->hash_seed = hdr.hash_seed;
566                         ntdb_context_init(ntdb);
567                         ntdb_ftable_init(ntdb);
568                 }
569                 if (ecode != NTDB_SUCCESS) {
570                         goto fail;
571                 }
572                 return ntdb;
573         }
574
575         if (stat(name, &st) != -1)
576                 ntdb->file = find_file(st.st_dev, st.st_ino);
577
578         if (!ntdb->file) {
579                 ecode = ntdb_new_file(ntdb);
580                 if (ecode != NTDB_SUCCESS) {
581                         goto fail;
582                 }
583
584                 /* Set this now, as ntdb_nest_lock examines it. */
585                 ntdb->file->map_size = 0;
586
587                 if ((ntdb->file->fd = open(name, open_flags, mode)) == -1) {
588                         enum ntdb_log_level lvl;
589                         /* errno set by open(2) */
590                         saved_errno = errno;
591
592                         /* Probing for files like this is a common pattern. */
593                         if (!(open_flags & O_CREAT) && errno == ENOENT) {
594                                 lvl = NTDB_LOG_WARNING;
595                         } else {
596                                 lvl = NTDB_LOG_ERROR;
597                         }
598                         ntdb_logerr(ntdb, NTDB_ERR_IO, lvl,
599                                    "ntdb_open: could not open file %s: %s",
600                                    name, strerror(errno));
601
602                         goto fail_errno;
603                 }
604
605                 /* ensure there is only one process initialising at once:
606                  * do it immediately to reduce the create/openlock race. */
607                 ecode = ntdb_lock_open(ntdb, openlock,
608                                        NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
609                 if (ecode != NTDB_SUCCESS) {
610                         saved_errno = errno;
611                         goto fail_errno;
612                 }
613
614                 /* on exec, don't inherit the fd */
615                 v = fcntl(ntdb->file->fd, F_GETFD, 0);
616                 fcntl(ntdb->file->fd, F_SETFD, v | FD_CLOEXEC);
617
618                 if (fstat(ntdb->file->fd, &st) == -1) {
619                         saved_errno = errno;
620                         ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
621                                    "ntdb_open: could not stat open %s: %s",
622                                    name, strerror(errno));
623                         goto fail_errno;
624                 }
625
626                 ntdb->file->device = st.st_dev;
627                 ntdb->file->inode = st.st_ino;
628         } else {
629                 /* ensure there is only one process initialising at once */
630                 ecode = ntdb_lock_open(ntdb, openlock,
631                                        NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
632                 if (ecode != NTDB_SUCCESS) {
633                         saved_errno = errno;
634                         goto fail_errno;
635                 }
636         }
637
638         /* call their open hook if they gave us one. */
639         if (ntdb->openhook) {
640                 ecode = ntdb->openhook(ntdb->file->fd, ntdb->openhook_data);
641                 if (ecode != NTDB_SUCCESS) {
642                         ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
643                                    "ntdb_open: open hook failed");
644                         goto fail;
645                 }
646                 open_flags |= O_CREAT;
647         }
648
649         /* If they used O_TRUNC, read will return 0. */
650         rlen = pread(ntdb->file->fd, &hdr, sizeof(hdr), 0);
651         if (rlen == 0 && (open_flags & O_CREAT)) {
652                 ecode = ntdb_new_database(ntdb, seed, &hdr);
653                 if (ecode != NTDB_SUCCESS) {
654                         goto fail;
655                 }
656         } else if (rlen < 0) {
657                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
658                                    "ntdb_open: error %s reading %s",
659                                    strerror(errno), name);
660                 goto fail;
661         } else if (rlen < sizeof(hdr)
662                    || strcmp(hdr.magic_food, NTDB_MAGIC_FOOD) != 0) {
663                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
664                                    "ntdb_open: %s is not a ntdb file", name);
665                 goto fail;
666         }
667
668         if (hdr.version != NTDB_VERSION) {
669                 if (hdr.version == bswap_64(NTDB_VERSION))
670                         ntdb->flags |= NTDB_CONVERT;
671                 else {
672                         /* wrong version */
673                         ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
674                                            "ntdb_open:"
675                                            " %s is unknown version 0x%llx",
676                                            name, (long long)hdr.version);
677                         goto fail;
678                 }
679         } else if (ntdb->flags & NTDB_CONVERT) {
680                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
681                                    "ntdb_open:"
682                                    " %s does not need NTDB_CONVERT",
683                                    name);
684                 goto fail;
685         }
686
687         ntdb_context_init(ntdb);
688
689         ntdb_convert(ntdb, &hdr, sizeof(hdr));
690         ntdb->hash_seed = hdr.hash_seed;
691         hash_test = NTDB_HASH_MAGIC;
692         hash_test = ntdb_hash(ntdb, &hash_test, sizeof(hash_test));
693         if (hdr.hash_test != hash_test) {
694                 /* wrong hash variant */
695                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
696                                    "ntdb_open:"
697                                    " %s uses a different hash function",
698                                    name);
699                 goto fail;
700         }
701
702         ecode = capabilities_ok(ntdb, hdr.capabilities);
703         if (ecode != NTDB_SUCCESS) {
704                 goto fail;
705         }
706
707         /* Clear any features we don't understand. */
708         if ((open_flags & O_ACCMODE) != O_RDONLY) {
709                 hdr.features_used &= NTDB_FEATURE_MASK;
710                 ecode = ntdb_write_convert(ntdb, offsetof(struct ntdb_header,
711                                                         features_used),
712                                           &hdr.features_used,
713                                           sizeof(hdr.features_used));
714                 if (ecode != NTDB_SUCCESS)
715                         goto fail;
716         }
717
718         ntdb_unlock_open(ntdb, openlock);
719
720         /* This makes sure we have current map_size and mmap. */
721         ecode = ntdb->io->oob(ntdb, ntdb->file->map_size, 1, true);
722         if (unlikely(ecode != NTDB_SUCCESS))
723                 goto fail;
724
725         if (ntdb->file->map_size % NTDB_PGSIZE != 0) {
726                 ecode = ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
727                                     "ntdb_open:"
728                                     " %s size %llu isn't a multiple of %u",
729                                     name, (long long)ntdb->file->map_size,
730                                     NTDB_PGSIZE);
731                 goto fail;
732         }
733
734         /* Now it's fully formed, recover if necessary. */
735         berr = ntdb_needs_recovery(ntdb);
736         if (unlikely(berr != false)) {
737                 if (berr < 0) {
738                         ecode = NTDB_OFF_TO_ERR(berr);
739                         goto fail;
740                 }
741                 ecode = ntdb_lock_and_recover(ntdb);
742                 if (ecode != NTDB_SUCCESS) {
743                         goto fail;
744                 }
745         }
746
747         ecode = ntdb_ftable_init(ntdb);
748         if (ecode != NTDB_SUCCESS) {
749                 goto fail;
750         }
751
752         ntdb->next = tdbs;
753         tdbs = ntdb;
754         return ntdb;
755
756  fail:
757         /* Map ecode to some logical errno. */
758         switch (NTDB_ERR_TO_OFF(ecode)) {
759         case NTDB_ERR_TO_OFF(NTDB_ERR_CORRUPT):
760         case NTDB_ERR_TO_OFF(NTDB_ERR_IO):
761                 saved_errno = EIO;
762                 break;
763         case NTDB_ERR_TO_OFF(NTDB_ERR_LOCK):
764                 saved_errno = EWOULDBLOCK;
765                 break;
766         case NTDB_ERR_TO_OFF(NTDB_ERR_OOM):
767                 saved_errno = ENOMEM;
768                 break;
769         case NTDB_ERR_TO_OFF(NTDB_ERR_EINVAL):
770                 saved_errno = EINVAL;
771                 break;
772         default:
773                 saved_errno = EINVAL;
774                 break;
775         }
776
777 fail_errno:
778 #ifdef NTDB_TRACE
779         close(ntdb->tracefd);
780 #endif
781         if (ntdb->file) {
782                 ntdb_lock_cleanup(ntdb);
783                 if (--ntdb->file->refcnt == 0) {
784                         assert(ntdb->file->num_lockrecs == 0);
785                         if (ntdb->file->map_ptr) {
786                                 if (ntdb->flags & NTDB_INTERNAL) {
787                                         ntdb->free_fn(ntdb->file->map_ptr,
788                                                       ntdb->alloc_data);
789                                 } else
790                                         ntdb_munmap(ntdb->file);
791                         }
792                         if (ntdb->file->fd != -1 && close(ntdb->file->fd) != 0)
793                                 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
794                                            "ntdb_open: failed to close ntdb fd"
795                                            " on error: %s", strerror(errno));
796                         ntdb->free_fn(ntdb->file->lockrecs, ntdb->alloc_data);
797                         ntdb->free_fn(ntdb->file, ntdb->alloc_data);
798                 }
799         }
800
801         ntdb->free_fn(ntdb, ntdb->alloc_data);
802         errno = saved_errno;
803         return NULL;
804 }
805
806 _PUBLIC_ int ntdb_close(struct ntdb_context *ntdb)
807 {
808         int ret = 0;
809         struct ntdb_context **i;
810
811         ntdb_trace(ntdb, "ntdb_close");
812
813         if (ntdb->transaction) {
814                 ntdb_transaction_cancel(ntdb);
815         }
816
817         if (ntdb->file->map_ptr) {
818                 if (ntdb->flags & NTDB_INTERNAL)
819                         ntdb->free_fn(ntdb->file->map_ptr, ntdb->alloc_data);
820                 else
821                         ntdb_munmap(ntdb->file);
822         }
823         if (ntdb->file) {
824                 ntdb_lock_cleanup(ntdb);
825                 if (--ntdb->file->refcnt == 0) {
826                         ret = close(ntdb->file->fd);
827                         ntdb->free_fn(ntdb->file->lockrecs, ntdb->alloc_data);
828                         ntdb->free_fn(ntdb->file, ntdb->alloc_data);
829                 }
830         }
831
832         /* Remove from tdbs list */
833         for (i = &tdbs; *i; i = &(*i)->next) {
834                 if (*i == ntdb) {
835                         *i = ntdb->next;
836                         break;
837                 }
838         }
839
840 #ifdef NTDB_TRACE
841         close(ntdb->tracefd);
842 #endif
843         ntdb->free_fn(ntdb, ntdb->alloc_data);
844
845         return ret;
846 }
847
848 _PUBLIC_ void ntdb_foreach_(int (*fn)(struct ntdb_context *, void *), void *p)
849 {
850         struct ntdb_context *i;
851
852         for (i = tdbs; i; i = i->next) {
853                 if (fn(i, p) != 0)
854                         break;
855         }
856 }