2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 1999-2005
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 3 of the License, or (at your option) any later version.
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; if not, see <http://www.gnu.org/licenses/>.
#include "tdb_private.h"
/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
/* Singly-linked list (via tdb->next) of every tdb open in this process;
 * maintained by tdb_open_ex()/tdb_close(), walked by tdb_already_open()
 * and tdb_reopen_all(). */
static struct tdb_context *tdbs = NULL;
34 -------------------------------------------------------------------------------
35 lookup3.c, by Bob Jenkins, May 2006, Public Domain.
37 These are functions for producing 32-bit hashes for hash table lookup.
38 tdb_jenkins_hash_word(), tdb_jenkins_hashlittle(), tdb_jenkins_hashlittle2(), tdb_jenkins_hashbig(),
39 tdb_jenkins_mix(), and tdb_jenkins_final() are externally useful functions. Routines to test the hash are included
40 if SELF_TEST is defined. You can use this free for any purpose. It's in
41 the public domain. It has no warranty.
43 You probably want to use tdb_jenkins_hashlittle().
44 tdb_jenkins_hashlittle() and tdb_jenkins_hashbig() hash byte arrays.
tdb_jenkins_hashlittle() is faster than tdb_jenkins_hashbig() on
46 little-endian machines. Intel and AMD are little-endian machines.
47 On second thought, you probably want tdb_jenkins_hashlittle2(), which is identical to
48 tdb_jenkins_hashlittle() except it returns two 32-bit hashes for the price of one.
49 You could implement tdb_jenkins_hashbig2() if you wanted but I haven't bothered here.
51 If you want to find a hash of, say, exactly 7 integers, do
52 a = i1; b = i2; c = i3;
53 tdb_jenkins_mix(a,b,c);
54 a += i4; b += i5; c += i6;
55 tdb_jenkins_mix(a,b,c);
57 tdb_jenkins_final(a,b,c);
58 then use c as the hash value. If you have a variable length array of
59 4-byte integers to hash, use tdb_jenkins_hash_word(). If you have a byte array (like
60 a character string), use tdb_jenkins_hashlittle(). If you have several byte arrays, or
61 a mix of things, see the comments above tdb_jenkins_hashlittle().
63 Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
64 then mix those integers. This is fast (you can do a lot more thorough
65 mixing with 12*3 instructions on 3 integers than you can with 3 instructions
66 on 1 byte), but shoehorning those bytes into integers efficiently is messy.
67 -------------------------------------------------------------------------------
#ifdef HAVE_ENDIAN_H
# include <endian.h>    /* attempt to define endianness */
#endif

/*
 * My best guess at if you are big-endian or little-endian.  This may
 * need adjustment.  In addition to the classic <endian.h> and
 * per-architecture tests, modern GCC/Clang predefine __BYTE_ORDER__,
 * which we also accept so the detection works without <endian.h>.
 */
#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
     __BYTE_ORDER == __LITTLE_ENDIAN) || \
    (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
     __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || \
    (defined(i386) || defined(__i386__) || defined(__i486__) || \
     defined(__i586__) || defined(__i686__) || defined(vax) || defined(MIPSEL))
# define HASH_LITTLE_ENDIAN 1
# define HASH_BIG_ENDIAN 0
#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
       __BYTE_ORDER == __BIG_ENDIAN) || \
      (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
       __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) || \
      (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))
# define HASH_LITTLE_ENDIAN 0
# define HASH_BIG_ENDIAN 1
#else
# error Unknown endian
#endif

/* hash tables should have a power-of-2 number of buckets; mask with
 * tdb_jenkins_hashmask(n) to reduce a hash to n bits */
#define tdb_jenkins_hashsize(n) ((uint32_t)1<<(n))
#define tdb_jenkins_hashmask(n) (tdb_jenkins_hashsize(n)-1)
/* rotate the 32-bit value x left by k bits (0 < k < 32) */
#define tdb_jenkins_rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))

/*
 * tdb_jenkins_mix -- mix 3 32-bit values reversibly.
 *
 * This is reversible, so any information in (a,b,c) before
 * tdb_jenkins_mix() is still in (a,b,c) after tdb_jenkins_mix().  The
 * rotate constants and operations were chosen by Bob Jenkins using
 * http://burtleburtle.net/bob/hash/avalanche.html so that pairs of
 * inputs differing in a few bits differ in at least 32 output bits.
 * This does not achieve full avalanche on its own; that is the job of
 * tdb_jenkins_final().  Rotates are used rather than shifts because
 * they cost the same and are much kinder to the top and bottom bits.
 */
#define tdb_jenkins_mix(a,b,c) \
do { \
	a -= c;  a ^= tdb_jenkins_rot(c, 4);  c += b; \
	b -= a;  b ^= tdb_jenkins_rot(a, 6);  a += c; \
	c -= b;  c ^= tdb_jenkins_rot(b, 8);  b += a; \
	a -= c;  a ^= tdb_jenkins_rot(c,16);  c += b; \
	b -= a;  b ^= tdb_jenkins_rot(a,19);  a += c; \
	c -= b;  c ^= tdb_jenkins_rot(b, 4);  b += a; \
} while (0)

/*
 * tdb_jenkins_final -- final mixing of 3 32-bit values (a,b,c) into c
 *
 * Pairs of (a,b,c) values differing in only a few bits will usually
 * produce values of c that look totally different.  c is the best
 * mixed output, b the second best.
 */
#define tdb_jenkins_final(a,b,c) \
do { \
	c ^= b; c -= tdb_jenkins_rot(b,14); \
	a ^= c; a -= tdb_jenkins_rot(c,11); \
	b ^= a; b -= tdb_jenkins_rot(a,25); \
	c ^= b; c -= tdb_jenkins_rot(b,16); \
	a ^= c; a -= tdb_jenkins_rot(c,4);  \
	b ^= a; b -= tdb_jenkins_rot(a,14); \
	c ^= b; c -= tdb_jenkins_rot(b,24); \
} while (0)

/*
 * tdb_jenkins_hashlittle() -- hash a variable-length key into a 32-bit value
 *   key    : the key (the unaligned variable-length array of bytes)
 *   length : the length of the key, counting by bytes
 *   val2   : IN: can be any 4-byte value  OUT: second 32 bit hash.
 * Returns a 32-bit value.  Every bit of the key affects every bit of
 * the return value.  Two keys differing by one or two bits will have
 * totally different hash values.  Note that the return value is better
 * mixed than *val2, so use that first.  The result is independent of
 * the key's alignment: the aligned 32-bit, aligned 16-bit and
 * byte-at-a-time code paths below all produce the same answer.
 *
 * The best hash table sizes are powers of 2.  There is no need to do
 * mod a prime (mod is sooo slow!).  If you need less than 32 bits,
 * use a bitmask, e.g. h = (h & tdb_jenkins_hashmask(10)) for a
 * tdb_jenkins_hashsize(10) element table.
 *
 * By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  Public domain.
 * Use for hash table lookup, or anything where one collision in 2^^32
 * is acceptable.  Do NOT use for cryptographic purposes.
 */
static uint32_t tdb_jenkins_hashlittle( const void *key, size_t length, uint32_t *val2 )
{
	uint32_t a,b,c;                                        /* internal state */
	union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */

	/* Set up the internal state */
	a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2;

	u.ptr = key;
	if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
		const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */
#ifdef VALGRIND
		const uint8_t  *k8;
#endif

		/*---- all but last block: aligned reads and affect 32 bits of (a,b,c) */
		while (length > 12)
		{
			a += k[0];
			b += k[1];
			c += k[2];
			tdb_jenkins_mix(a,b,c);
			length -= 12;
			k += 3;
		}

		/*-------------------------- handle the last (probably partial) block */
		/*
		 * "k[2]&0xffffff" actually reads beyond the end of the string, but
		 * then masks off the part it's not allowed to read.  Because the
		 * string is aligned, the masked-off tail is in the same word as the
		 * rest of the string.  Every machine with memory protection I've seen
		 * does it on word boundaries, so is OK with this.  But VALGRIND will
		 * still catch it and complain.  The masking trick does make the hash
		 * noticably faster for short strings (like English words).
		 */
#ifndef VALGRIND

		switch(length)
		{
		case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
		case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
		case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
		case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
		case 8 : b+=k[1]; a+=k[0]; break;
		case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
		case 6 : b+=k[1]&0xffff; a+=k[0]; break;
		case 5 : b+=k[1]&0xff; a+=k[0]; break;
		case 4 : a+=k[0]; break;
		case 3 : a+=k[0]&0xffffff; break;
		case 2 : a+=k[0]&0xffff; break;
		case 1 : a+=k[0]&0xff; break;
		case 0 : return c; /* zero length strings require no mixing */
		}

#else /* make valgrind happy */

		k8 = (const uint8_t *)k;
		switch(length)
		{
		case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
		case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
		case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
		case 9 : c+=k8[8];                   /* fall through */
		case 8 : b+=k[1]; a+=k[0]; break;
		case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
		case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
		case 5 : b+=k8[4];                   /* fall through */
		case 4 : a+=k[0]; break;
		case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
		case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
		case 1 : a+=k8[0]; break;
		case 0 : return c;
		}

#endif /* !valgrind */

	} else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
		const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */
		const uint8_t  *k8;

		/*------------ all but last block: aligned reads and different mixing */
		while (length > 12)
		{
			a += k[0] + (((uint32_t)k[1])<<16);
			b += k[2] + (((uint32_t)k[3])<<16);
			c += k[4] + (((uint32_t)k[5])<<16);
			tdb_jenkins_mix(a,b,c);
			length -= 12;
			k += 6;
		}

		/*-------------------------- handle the last (probably partial) block */
		k8 = (const uint8_t *)k;
		switch(length)
		{
		case 12: c+=k[4]+(((uint32_t)k[5])<<16);
			 b+=k[2]+(((uint32_t)k[3])<<16);
			 a+=k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
		case 10: c+=k[4];
			 b+=k[2]+(((uint32_t)k[3])<<16);
			 a+=k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 9 : c+=k8[8];                   /* fall through */
		case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
			 a+=k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
		case 6 : b+=k[2];
			 a+=k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 5 : b+=k8[4];                   /* fall through */
		case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
			 break;
		case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
		case 2 : a+=k[0];
			 break;
		case 1 : a+=k8[0];
			 break;
		case 0 : return c; /* zero length requires no mixing */
		}

	} else { /* need to read the key one byte at a time */
		const uint8_t *k = (const uint8_t *)key;

		/*------------ all but the last block: affect some 32 bits of (a,b,c) */
		while (length > 12)
		{
			a += k[0];
			a += ((uint32_t)k[1])<<8;
			a += ((uint32_t)k[2])<<16;
			a += ((uint32_t)k[3])<<24;
			b += k[4];
			b += ((uint32_t)k[5])<<8;
			b += ((uint32_t)k[6])<<16;
			b += ((uint32_t)k[7])<<24;
			c += k[8];
			c += ((uint32_t)k[9])<<8;
			c += ((uint32_t)k[10])<<16;
			c += ((uint32_t)k[11])<<24;
			tdb_jenkins_mix(a,b,c);
			length -= 12;
			k += 12;
		}

		/*----------------------------- last block: affect all 32 bits of (c) */
		switch(length) /* all the case statements fall through */
		{
		case 12: c+=((uint32_t)k[11])<<24;
		case 11: c+=((uint32_t)k[10])<<16;
		case 10: c+=((uint32_t)k[9])<<8;
		case 9 : c+=k[8];
		case 8 : b+=((uint32_t)k[7])<<24;
		case 7 : b+=((uint32_t)k[6])<<16;
		case 6 : b+=((uint32_t)k[5])<<8;
		case 5 : b+=k[4];
		case 4 : a+=((uint32_t)k[3])<<24;
		case 3 : a+=((uint32_t)k[2])<<16;
		case 2 : a+=((uint32_t)k[1])<<8;
		case 1 : a+=k[0];
			 break;
		case 0 : return c; /* zero length requires no mixing */
		}
	}

	tdb_jenkins_final(a,b,c);
	*val2 = b;
	return c;
}

/*
 * tdb_jenkins_hashbig():
 * This is the same as tdb_jenkins_hash_word() on big-endian machines.  It is
 * different from tdb_jenkins_hashlittle() on all machines.
 * tdb_jenkins_hashbig() takes advantage of big-endian byte ordering.
 * Same interface as tdb_jenkins_hashlittle(): returns the primary hash and
 * stores a secondary hash through val2 (except for zero-length keys).
 */
static uint32_t tdb_jenkins_hashbig( const void *key, size_t length, uint32_t *val2)
{
	uint32_t a,b,c;
	union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */

	/* Set up the internal state */
	a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2;

	u.ptr = key;
	if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) {
		const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */
#ifdef VALGRIND
		const uint8_t  *k8;
#endif

		/*---- all but last block: aligned reads and affect 32 bits of (a,b,c) */
		while (length > 12)
		{
			a += k[0];
			b += k[1];
			c += k[2];
			tdb_jenkins_mix(a,b,c);
			length -= 12;
			k += 3;
		}

		/*-------------------------- handle the last (probably partial) block */
		/*
		 * "k[2]<<8" actually reads beyond the end of the string, but
		 * then shifts out the part it's not allowed to read.  Because the
		 * string is aligned, the illegal read is in the same word as the
		 * rest of the string.  Every machine with memory protection I've seen
		 * does it on word boundaries, so is OK with this.  But VALGRIND will
		 * still catch it and complain.  The masking trick does make the hash
		 * noticably faster for short strings (like English words).
		 */
#ifndef VALGRIND

		switch(length)
		{
		case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
		case 11: c+=k[2]&0xffffff00; b+=k[1]; a+=k[0]; break;
		case 10: c+=k[2]&0xffff0000; b+=k[1]; a+=k[0]; break;
		case 9 : c+=k[2]&0xff000000; b+=k[1]; a+=k[0]; break;
		case 8 : b+=k[1]; a+=k[0]; break;
		case 7 : b+=k[1]&0xffffff00; a+=k[0]; break;
		case 6 : b+=k[1]&0xffff0000; a+=k[0]; break;
		case 5 : b+=k[1]&0xff000000; a+=k[0]; break;
		case 4 : a+=k[0]; break;
		case 3 : a+=k[0]&0xffffff00; break;
		case 2 : a+=k[0]&0xffff0000; break;
		case 1 : a+=k[0]&0xff000000; break;
		case 0 : return c; /* zero length strings require no mixing */
		}

#else  /* make valgrind happy */

		k8 = (const uint8_t *)k;
		switch(length) /* all the case statements fall through */
		{
		case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
		case 11: c+=((uint32_t)k8[10])<<8;   /* fall through */
		case 10: c+=((uint32_t)k8[9])<<16;   /* fall through */
		case 9 : c+=((uint32_t)k8[8])<<24;   /* fall through */
		case 8 : b+=k[1]; a+=k[0]; break;
		case 7 : b+=((uint32_t)k8[6])<<8;    /* fall through */
		case 6 : b+=((uint32_t)k8[5])<<16;   /* fall through */
		case 5 : b+=((uint32_t)k8[4])<<24;   /* fall through */
		case 4 : a+=k[0]; break;
		case 3 : a+=((uint32_t)k8[2])<<8;    /* fall through */
		case 2 : a+=((uint32_t)k8[1])<<16;   /* fall through */
		case 1 : a+=((uint32_t)k8[0])<<24; break;
		case 0 : return c;
		}

#endif /* !VALGRIND */

	} else { /* need to read the key one byte at a time */
		const uint8_t *k = (const uint8_t *)key;

		/*------------ all but the last block: affect some 32 bits of (a,b,c) */
		while (length > 12)
		{
			a += ((uint32_t)k[0])<<24;
			a += ((uint32_t)k[1])<<16;
			a += ((uint32_t)k[2])<<8;
			a += ((uint32_t)k[3]);
			b += ((uint32_t)k[4])<<24;
			b += ((uint32_t)k[5])<<16;
			b += ((uint32_t)k[6])<<8;
			b += ((uint32_t)k[7]);
			c += ((uint32_t)k[8])<<24;
			c += ((uint32_t)k[9])<<16;
			c += ((uint32_t)k[10])<<8;
			c += ((uint32_t)k[11]);
			tdb_jenkins_mix(a,b,c);
			length -= 12;
			k += 12;
		}

		/*----------------------------- last block: affect all 32 bits of (c) */
		switch(length) /* all the case statements fall through */
		{
		case 12: c+=((uint32_t)k[11]);
		case 11: c+=((uint32_t)k[10])<<8;
		case 10: c+=((uint32_t)k[9])<<16;
		case 9 : c+=((uint32_t)k[8])<<24;
		case 8 : b+=((uint32_t)k[7]);
		case 7 : b+=((uint32_t)k[6])<<8;
		case 6 : b+=((uint32_t)k[5])<<16;
		case 5 : b+=((uint32_t)k[4])<<24;
		case 4 : a+=((uint32_t)k[3]);
		case 3 : a+=((uint32_t)k[2])<<8;
		case 2 : a+=((uint32_t)k[1])<<16;
		case 1 : a+=((uint32_t)k[0])<<24;
			 break;
		case 0 : return c; /* zero length requires no mixing */
		}
	}

	tdb_jenkins_final(a,b,c);
	*val2 = b;
	return c;
}

/* Dispatch to the variant matching this machine's byte order; "base" seeds
 * the hash so chained hashing of multiple buffers is possible. */
static uint32_t tdb_jenkins_hash_any(const void *key, size_t length, uint32_t base)
{
	if (HASH_BIG_ENDIAN)
		return tdb_jenkins_hashbig(key, length, &base);
	else
		return tdb_jenkins_hashlittle(key, length, &base);
}
/* Stored in the header's rwlocks field to mark a tdb created with the
 * Jenkins hash (0x4A414E4B is "JANK" in ASCII); checked/written in
 * tdb_new_database() and tdb_open_ex(). */
#define TDB1_RWLOCK_JENKINS_MAGIC 0x4A414E4B
528 static unsigned int tdb_jenkins_hash(TDB_DATA *key)
530 return tdb_jenkins_hash_any(key->dptr, key->dsize, 0);
534 /* This is based on the hash algorithm from gdbm */
535 static unsigned int default_tdb_hash(TDB_DATA *key)
537 uint32_t value; /* Used to compute the hash value. */
538 uint32_t i; /* Used to cycle through random values. */
540 /* Set the initial value from the key size. */
541 for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
542 value = (value + (key->dptr[i] << (i*5 % 24)));
544 return (1103515243 * value + 12345);
547 /* We use two hashes to double-check they're using the right hash function. */
548 void tdb_header_hash(struct tdb_context *tdb,
549 uint32_t *magic1_hash, uint32_t *magic2_hash)
552 uint32_t tdb_magic = TDB_MAGIC;
554 hash_key.dptr = discard_const_p(uint8_t, TDB_MAGIC_FOOD);
555 hash_key.dsize = sizeof(TDB_MAGIC_FOOD);
556 *magic1_hash = tdb->hash_fn(&hash_key);
558 hash_key.dptr = CONVERT(tdb_magic);
559 hash_key.dsize = sizeof(tdb_magic);
560 *magic2_hash = tdb->hash_fn(&hash_key);
562 /* Make sure at least one hash is non-zero! */
563 if (*magic1_hash == 0 && *magic2_hash == 0)
567 /* initialise a new database with a specified hash size */
568 static int tdb_new_database(struct tdb_context *tdb, int hash_size)
570 struct tdb_header *newdb;
574 /* We make it up in memory, then write it out if not internal */
575 size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off_t);
576 if (!(newdb = (struct tdb_header *)calloc(size, 1))) {
577 tdb->ecode = TDB_ERR_OOM;
581 /* Fill in the header */
582 newdb->version = TDB_VERSION;
583 newdb->hash_size = hash_size;
585 tdb_header_hash(tdb, &newdb->magic1_hash, &newdb->magic2_hash);
587 if (tdb->hash_fn == tdb_jenkins_hash) {
588 newdb->rwlocks = TDB1_RWLOCK_JENKINS_MAGIC;
591 if (tdb->flags & TDB_INTERNAL) {
592 tdb->map_size = size;
593 tdb->map_ptr = (char *)newdb;
594 memcpy(&tdb->header, newdb, sizeof(tdb->header));
595 /* Convert the `ondisk' version if asked. */
599 if (lseek(tdb->fd, 0, SEEK_SET) == -1)
602 if (ftruncate(tdb->fd, 0) == -1)
605 /* This creates an endian-converted header, as if read from disk */
607 memcpy(&tdb->header, newdb, sizeof(tdb->header));
608 /* Don't endian-convert the magic food! */
609 memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
610 /* we still have "ret == -1" here */
611 if (tdb_write_all(tdb->fd, newdb, size))
621 static int tdb_already_open(dev_t device,
624 struct tdb_context *i;
626 for (i = tdbs; i; i = i->next) {
627 if (i->device == device && i->inode == ino) {
635 /* open the database, creating it if necessary
637 The open_flags and mode are passed straight to the open call on the
638 database file. A flags value of O_WRONLY is invalid. The hash size
639 is advisory, use zero for a default value.
641 Return is NULL on error, in which case errno is also set. Don't
642 try to call tdb_error or tdb_errname, just do strerror(errno).
644 @param name may be NULL for internal databases. */
645 struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags,
646 int open_flags, mode_t mode)
648 return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
651 /* a default logging function */
652 static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...) PRINTF_ATTRIBUTE(3, 4);
653 static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
658 struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
659 int open_flags, mode_t mode,
660 const struct tdb_logging_context *log_ctx,
661 tdb_hash_func hash_fn)
663 struct tdb_context *tdb;
665 int rev = 0, locked = 0;
669 uint32_t magic1_hash;
670 uint32_t magic2_hash;
671 const char *hash_alg;
672 uint32_t expected_rwlocks = 0;
674 if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) {
686 tdb->flags = tdb_flags;
687 tdb->open_flags = open_flags;
691 tdb->log.log_fn = null_log_fn;
692 tdb->log.log_private = NULL;
696 tdb->hash_fn = hash_fn;
697 hash_alg = "user defined";
699 if (tdb_flags & TDB_CLEAR_IF_FIRST) {
700 tdb->hash_fn = tdb_jenkins_hash;
701 hash_alg = "jenkins";
703 tdb->hash_fn = default_tdb_hash;
704 hash_alg = "default";
708 /* cache the page size */
709 tdb->page_size = getpagesize();
710 if (tdb->page_size <= 0) {
711 tdb->page_size = 0x2000;
714 tdb->max_dead_records = (tdb_flags & TDB_VOLATILE) ? 5 : 0;
716 if ((open_flags & O_ACCMODE) == O_WRONLY) {
717 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: can't open tdb %s write-only\n",
724 hash_size = DEFAULT_HASH_SIZE;
725 if ((open_flags & O_ACCMODE) == O_RDONLY) {
727 /* read only databases don't do locking or clear if first */
728 tdb->flags |= TDB_NOLOCK;
729 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
732 if ((tdb->flags & TDB_ALLOW_NESTING) &&
733 (tdb->flags & TDB_DISALLOW_NESTING)) {
734 tdb->ecode = TDB_ERR_NESTING;
735 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
736 "allow_nesting and disallow_nesting are not allowed together!"));
741 if (getenv("TDB_NO_FSYNC")) {
742 tdb->flags |= TDB_NOSYNC;
746 * TDB_ALLOW_NESTING is the default behavior.
747 * Note: this may change in future versions!
749 if (!(tdb->flags & TDB_DISALLOW_NESTING)) {
750 tdb->flags |= TDB_ALLOW_NESTING;
753 /* internal databases don't mmap or lock, and start off cleared */
754 if (tdb->flags & TDB_INTERNAL) {
755 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
756 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
757 if (tdb_new_database(tdb, hash_size) != 0) {
758 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: tdb_new_database failed!"));
764 if ((tdb->fd = open(name, open_flags, mode)) == -1) {
765 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_open_ex: could not open file %s: %s\n",
766 name, strerror(errno)));
767 goto fail; /* errno set by open(2) */
770 /* on exec, don't inherit the fd */
771 v = fcntl(tdb->fd, F_GETFD, 0);
772 fcntl(tdb->fd, F_SETFD, v | FD_CLOEXEC);
774 /* ensure there is only one process initialising at once */
775 if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
776 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to get open lock on %s: %s\n",
777 name, strerror(errno)));
778 goto fail; /* errno set by tdb_brlock */
781 /* we need to zero database if we are the only one with it open */
782 if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
784 (locked = (tdb_nest_lock(tdb, ACTIVE_LOCK, F_WRLCK, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE) == 0))) {
785 open_flags |= O_CREAT;
786 if (ftruncate(tdb->fd, 0) == -1) {
787 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
788 "failed to truncate %s: %s\n",
789 name, strerror(errno)));
790 goto fail; /* errno set by ftruncate */
795 if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
796 || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0) {
797 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
799 errno = EIO; /* ie bad format or something */
803 rev = (tdb->flags & TDB_CONVERT);
804 } else if (tdb->header.version != TDB_VERSION
805 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION)))) {
810 vp = (unsigned char *)&tdb->header.version;
811 vertest = (((uint32_t)vp[0]) << 24) | (((uint32_t)vp[1]) << 16) |
812 (((uint32_t)vp[2]) << 8) | (uint32_t)vp[3];
813 tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
815 tdb->flags &= ~TDB_CONVERT;
817 tdb->flags |= TDB_CONVERT;
818 tdb_convert(&tdb->header, sizeof(tdb->header));
820 if (fstat(tdb->fd, &st) == -1)
823 if (tdb->hash_fn == tdb_jenkins_hash &&
824 tdb->header.rwlocks != TDB1_RWLOCK_JENKINS_MAGIC) {
825 tdb->hash_fn = default_tdb_hash;
826 hash_alg = "default";
828 if (tdb->hash_fn == default_tdb_hash &&
829 tdb->header.rwlocks == TDB1_RWLOCK_JENKINS_MAGIC) {
830 tdb->hash_fn = tdb_jenkins_hash;
831 hash_alg = "jenkins";
834 tdb_header_hash(tdb, &magic1_hash, &magic2_hash);
836 if ((tdb->header.magic1_hash == 0) && (tdb->header.magic2_hash == 0)) {
837 /* older TDB without magic hash references */
838 if (hash_fn == NULL) {
839 tdb->hash_fn = default_tdb_hash;
840 hash_alg = "default";
842 } else if ((tdb->header.magic1_hash != magic1_hash) ||
843 (tdb->header.magic2_hash != magic2_hash)) {
844 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
845 "%s was not created with the %s hash function we are using\n"
846 "magic1_hash[0x%08X %s 0x%08X] "
847 "magic2_hash[0x%08X %s 0x%08X]\n",
849 tdb->header.magic1_hash,
850 (tdb->header.magic1_hash == magic1_hash) ? "==" : "!=",
852 tdb->header.magic2_hash,
853 (tdb->header.magic2_hash == magic2_hash) ? "==" : "!=",
858 if (tdb->hash_fn == tdb_jenkins_hash) {
859 expected_rwlocks = TDB1_RWLOCK_JENKINS_MAGIC;
863 if (tdb->header.rwlocks != expected_rwlocks) {
864 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: spinlocks no longer supported\n"));
868 /* Is it already in the open list? If so, fail. */
869 if (tdb_already_open(st.st_dev, st.st_ino)) {
870 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
871 "%s (%d,%d) is already open in this process\n",
872 name, (int)st.st_dev, (int)st.st_ino));
877 if (!(tdb->name = (char *)strdup(name))) {
882 tdb->map_size = st.st_size;
883 tdb->device = st.st_dev;
884 tdb->inode = st.st_ino;
887 if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) {
888 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
889 "failed to release ACTIVE_LOCK on %s: %s\n",
890 name, strerror(errno)));
896 /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
897 we didn't get the initial exclusive lock as we need to let all other
898 users know we're using it. */
900 if (tdb_flags & TDB_CLEAR_IF_FIRST) {
901 /* leave this lock in place to indicate it's in use */
902 if (tdb_nest_lock(tdb, ACTIVE_LOCK, F_RDLCK, TDB_LOCK_WAIT) == -1) {
907 /* if needed, run recovery */
908 if (tdb_transaction_recover(tdb) == -1) {
914 char tracefile[strlen(name) + 32];
916 snprintf(tracefile, sizeof(tracefile),
917 "%s.trace.%li", name, (long)getpid());
918 tdb->tracefd = open(tracefile, O_WRONLY|O_CREAT|O_EXCL, 0600);
919 if (tdb->tracefd >= 0) {
920 tdb_enable_seqnum(tdb);
921 tdb_trace_open(tdb, "tdb_open", hash_size, tdb_flags,
924 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to open trace file %s!\n", tracefile));
929 /* Internal (memory-only) databases skip all the code above to
930 * do with disk files, and resume here by releasing their
931 * open lock and hooking into the active list. */
932 if (tdb_nest_unlock(tdb, OPEN_LOCK, F_WRLCK, false) == -1) {
940 { int save_errno = errno;
949 if (tdb->flags & TDB_INTERNAL)
950 SAFE_FREE(tdb->map_ptr);
954 SAFE_FREE(tdb->name);
956 if (close(tdb->fd) != 0)
957 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to close tdb->fd on error!\n"));
958 SAFE_FREE(tdb->lockrecs);
966 * Set the maximum number of dead records per hash chain
969 void tdb_set_max_dead(struct tdb_context *tdb, int max_dead)
971 tdb->max_dead_records = max_dead;
977 * @returns -1 for error; 0 for success.
979 int tdb_close(struct tdb_context *tdb)
981 struct tdb_context **i;
984 if (tdb->transaction) {
985 tdb_transaction_cancel(tdb);
987 tdb_trace(tdb, "tdb_close");
990 if (tdb->flags & TDB_INTERNAL)
991 SAFE_FREE(tdb->map_ptr);
995 SAFE_FREE(tdb->name);
997 ret = close(tdb->fd);
1000 SAFE_FREE(tdb->lockrecs);
1002 /* Remove from contexts list */
1003 for (i = &tdbs; *i; i = &(*i)->next) {
1011 close(tdb->tracefd);
1013 memset(tdb, 0, sizeof(*tdb));
1019 /* register a loging function */
1020 void tdb_set_logging_function(struct tdb_context *tdb,
1021 const struct tdb_logging_context *log_ctx)
1023 tdb->log = *log_ctx;
1026 void *tdb_get_logging_private(struct tdb_context *tdb)
1028 return tdb->log.log_private;
1031 static int tdb_reopen_internal(struct tdb_context *tdb, bool active_lock)
1033 #if !defined(LIBREPLACE_PREAD_NOT_REPLACED) || \
1034 !defined(LIBREPLACE_PWRITE_NOT_REPLACED)
1038 if (tdb->flags & TDB_INTERNAL) {
1039 return 0; /* Nothing to do. */
1042 if (tdb_have_extra_locks(tdb)) {
1043 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed with locks held\n"));
1047 if (tdb->transaction != 0) {
1048 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed inside a transaction\n"));
1052 /* If we have real pread & pwrite, we can skip reopen. */
1053 #if !defined(LIBREPLACE_PREAD_NOT_REPLACED) || \
1054 !defined(LIBREPLACE_PWRITE_NOT_REPLACED)
1055 if (tdb_munmap(tdb) != 0) {
1056 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
1059 if (close(tdb->fd) != 0)
1060 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
1061 tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
1062 if (tdb->fd == -1) {
1063 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: open failed (%s)\n", strerror(errno)));
1066 if (fstat(tdb->fd, &st) != 0) {
1067 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
1070 if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
1071 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: file dev/inode has changed!\n"));
1075 #endif /* fake pread or pwrite */
1077 /* We may still think we hold the active lock. */
1078 tdb->num_lockrecs = 0;
1079 SAFE_FREE(tdb->lockrecs);
1081 if (active_lock && tdb_nest_lock(tdb, ACTIVE_LOCK, F_RDLCK, TDB_LOCK_WAIT) == -1) {
1082 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: failed to obtain active lock\n"));
1093 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
1094 seek pointer from our parent and to re-establish locks */
1095 int tdb_reopen(struct tdb_context *tdb)
1097 return tdb_reopen_internal(tdb, tdb->flags & TDB_CLEAR_IF_FIRST);
1100 /* reopen all tdb's */
1101 int tdb_reopen_all(int parent_longlived)
1103 struct tdb_context *tdb;
1105 for (tdb=tdbs; tdb; tdb = tdb->next) {
1106 bool active_lock = (tdb->flags & TDB_CLEAR_IF_FIRST);
1109 * If the parent is longlived (ie. a
1110 * parent daemon architecture), we know
1111 * it will keep it's active lock on a
1112 * tdb opened with CLEAR_IF_FIRST. Thus
1113 * for child processes we don't have to
1114 * add an active lock. This is essential
1115 * to improve performance on systems that
1116 * keep POSIX locks as a non-scalable data
1117 * structure in the kernel.
1119 if (parent_longlived) {
1120 /* Ensure no clear-if-first. */
1121 active_lock = false;
1124 if (tdb_reopen_internal(tdb, active_lock) != 0)