lib/tdb/common/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
  30 _PUBLIC_ TDB_DATA tdb_null;
  31
  32 /*
  33   non-blocking increment of the tdb sequence number if the tdb has been opened using
  34   the TDB_SEQNUM flag
  35 */
  36 _PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
  37 {
  38         tdb_off_t seqnum=0;
  39
  40         if (!(tdb->flags & TDB_SEQNUM)) {
  41                 return;
  42         }
  43
  44         /* we ignore errors from this, as we have no sane way of
  45            dealing with them.
  46         */
  47         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
  48         seqnum++;
  49         tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
  50 }
  51
  52 /*
  53   increment the tdb sequence number if the tdb has been opened using
  54   the TDB_SEQNUM flag
  55 */
  56 static void tdb_increment_seqnum(struct tdb_context *tdb)
  57 {
  58         if (!(tdb->flags & TDB_SEQNUM)) {
  59                 return;
  60         }
  61
  62         if (tdb->transaction != NULL) {
  63                 tdb_increment_seqnum_nonblock(tdb);
  64                 return;
  65         }
  66
  67         if (tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
  68                           TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
  69                 return;
  70         }
  71
  72         tdb_increment_seqnum_nonblock(tdb);
  73
  74         tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
  75 }
  76
  77 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
  78 {
  79         return memcmp(data.dptr, key.dptr, data.dsize);
  80 }
  81
  82 /* Returns 0 on fail.  On success, return offset of record, and fills
  83    in rec */
  84 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
  85                         struct tdb_record *r)
  86 {
  87         tdb_off_t rec_ptr;
  88
  89         /* read in the hash top */
  90         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
  91                 return 0;
  92
  93         /* keep looking until we find the right record */
  94         while (rec_ptr) {
  95                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
  96                         return 0;
  97
  98                 if (!TDB_DEAD(r) && hash==r->full_hash
  99                     && key.dsize==r->key_len
 100                     && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
 101                                       r->key_len, tdb_key_compare,
 102                                       NULL) == 0) {
 103                         return rec_ptr;
 104                 }
 105                 /* detect tight infinite loop */
 106                 if (rec_ptr == r->next) {
 107                         tdb->ecode = TDB_ERR_CORRUPT;
 108                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_find: loop detected.\n"));
 109                         return 0;
 110                 }
 111                 rec_ptr = r->next;
 112         }
 113         tdb->ecode = TDB_ERR_NOEXIST;
 114         return 0;
 115 }
 116
 117 /* As tdb_find, but if you succeed, keep the lock */
 118 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
 119                            struct tdb_record *rec)
 120 {
 121         uint32_t rec_ptr;
 122
 123         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 124                 return 0;
 125         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 126                 tdb_unlock(tdb, BUCKET(hash), locktype);
 127         return rec_ptr;
 128 }
 129
 130 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
 131
 132 static int tdb_update_hash_cmp(TDB_DATA key, TDB_DATA data, void *private_data)
 133 {
 134         TDB_DATA *dbuf = (TDB_DATA *)private_data;
 135
 136         if (dbuf->dsize != data.dsize) {
 137                 return -1;
 138         }
 139         if (memcmp(dbuf->dptr, data.dptr, data.dsize) != 0) {
 140                 return -1;
 141         }
 142         return 0;
 143 }
 144
 145 /* update an entry in place - this only works if the new data size
 146    is <= the old data size and the key exists.
 147    on failure return -1.
 148 */
 149 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
 150 {
 151         struct tdb_record rec;
 152         tdb_off_t rec_ptr;
 153
 154         /* find entry */
 155         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 156                 return -1;
 157
 158         /* it could be an exact duplicate of what is there - this is
 159          * surprisingly common (eg. with a ldb re-index). */
 160         if (rec.key_len == key.dsize &&
 161             rec.data_len == dbuf.dsize &&
 162             rec.full_hash == hash &&
 163             tdb_parse_record(tdb, key, tdb_update_hash_cmp, &dbuf) == 0) {
 164                 return 0;
 165         }
 166
 167         /* must be long enough key, data and tailer */
 168         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
 169                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
 170                 return -1;
 171         }
 172
 173         if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 174                       dbuf.dptr, dbuf.dsize) == -1)
 175                 return -1;
 176
 177         if (dbuf.dsize != rec.data_len) {
 178                 /* update size */
 179                 rec.data_len = dbuf.dsize;
 180                 return tdb_rec_write(tdb, rec_ptr, &rec);
 181         }
 182
 183         return 0;
 184 }
 185
 186 /* find an entry in the database given a key */
 187 /* If an entry doesn't exist tdb_err will be set to
 188  * TDB_ERR_NOEXIST. If a key has no data attached
 189  * then the TDB_DATA will have zero length but
 190  * a non-zero pointer
 191  */
 192 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 193 {
 194         tdb_off_t rec_ptr;
 195         struct tdb_record rec;
 196         TDB_DATA ret;
 197         uint32_t hash;
 198
 199         /* find which hash bucket it is in */
 200         hash = tdb->hash_fn(&key);
 201         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 202                 return tdb_null;
 203
 204         ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 205                                   rec.data_len);
 206         ret.dsize = rec.data_len;
 207         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 208         return ret;
 209 }
 210
 211 _PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 212 {
 213         TDB_DATA ret = _tdb_fetch(tdb, key);
 214
 215         tdb_trace_1rec_retrec(tdb, "tdb_fetch", key, ret);
 216         return ret;
 217 }
 218
 219 /*
 220  * Find an entry in the database and hand the record's data to a parsing
 221  * function. The parsing function is executed under the chain read lock, so it
 222  * should be fast and should not block on other syscalls.
 223  *
 224  * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
 225  *
 226  * For mmapped tdb's that do not have a transaction open it points the parsing
 227  * function directly at the mmap area, it avoids the malloc/memcpy in this
 228  * case. If a transaction is open or no mmap is available, it has to do
 229  * malloc/read/parse/free.
 230  *
 231  * This is interesting for all readers of potentially large data structures in
 232  * the tdb records, ldb indexes being one example.
 233  *
 234  * Return -1 if the record was not found.
 235  */
 236
 237 _PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
 238                      int (*parser)(TDB_DATA key, TDB_DATA data,
 239                                    void *private_data),
 240                      void *private_data)
 241 {
 242         tdb_off_t rec_ptr;
 243         struct tdb_record rec;
 244         int ret;
 245         uint32_t hash;
 246
 247         /* find which hash bucket it is in */
 248         hash = tdb->hash_fn(&key);
 249
 250         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
 251                 /* record not found */
 252                 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
 253                 tdb->ecode = TDB_ERR_NOEXIST;
 254                 return -1;
 255         }
 256         tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
 257
 258         ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
 259                              rec.data_len, parser, private_data);
 260
 261         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 262
 263         return ret;
 264 }
 265
 266 /* check if an entry in the database exists
 267
 268    note that 1 is returned if the key is found and 0 is returned if not found
 269    this doesn't match the conventions in the rest of this module, but is
 270    compatible with gdbm
 271 */
 272 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 273 {
 274         struct tdb_record rec;
 275
 276         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 277                 return 0;
 278         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 279         return 1;
 280 }
 281
 282 _PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
 283 {
 284         uint32_t hash = tdb->hash_fn(&key);
 285         int ret;
 286
 287         ret = tdb_exists_hash(tdb, key, hash);
 288         tdb_trace_1rec_ret(tdb, "tdb_exists", key, ret);
 289         return ret;
 290 }
 291
 292 /* actually delete an entry in the database given the offset */
 293 int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec)
 294 {
 295         tdb_off_t last_ptr, i;
 296         struct tdb_record lastrec;
 297
 298         if (tdb->read_only || tdb->traverse_read) return -1;
 299
 300         if (((tdb->traverse_write != 0) && (!TDB_DEAD(rec))) ||
 301             tdb_write_lock_record(tdb, rec_ptr) == -1) {
 302                 /* Someone traversing here: mark it as dead */
 303                 rec->magic = TDB_DEAD_MAGIC;
 304                 return tdb_rec_write(tdb, rec_ptr, rec);
 305         }
 306         if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
 307                 return -1;
 308
 309         /* find previous record in hash chain */
 310         if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 311                 return -1;
 312         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 313                 if (tdb_rec_read(tdb, i, &lastrec) == -1)
 314                         return -1;
 315
 316         /* unlink it: next ptr is at start of record. */
 317         if (last_ptr == 0)
 318                 last_ptr = TDB_HASH_TOP(rec->full_hash);
 319         if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
 320                 return -1;
 321
 322         /* recover the space */
 323         if (tdb_free(tdb, rec_ptr, rec) == -1)
 324                 return -1;
 325         return 0;
 326 }
 327
 328 static int tdb_count_dead(struct tdb_context *tdb, uint32_t hash)
 329 {
 330         int res = 0;
 331         tdb_off_t rec_ptr;
 332         struct tdb_record rec;
 333
 334         /* read in the hash top */
 335         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 336                 return 0;
 337
 338         while (rec_ptr) {
 339                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
 340                         return 0;
 341
 342                 if (rec.magic == TDB_DEAD_MAGIC) {
 343                         res += 1;
 344                 }
 345                 rec_ptr = rec.next;
 346         }
 347         return res;
 348 }
 349
 350 /*
 351  * Purge all DEAD records from a hash chain
 352  */
 353 int tdb_purge_dead(struct tdb_context *tdb, uint32_t hash)
 354 {
 355         int res = -1;
 356         struct tdb_record rec;
 357         tdb_off_t rec_ptr;
 358
 359         if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == -1) {
 360                 /*
 361                  * Don't block the freelist if not strictly necessary
 362                  */
 363                 return -1;
 364         }
 365
 366         /* read in the hash top */
 367         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 368                 goto fail;
 369
 370         while (rec_ptr) {
 371                 tdb_off_t next;
 372
 373                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
 374                         goto fail;
 375                 }
 376
 377                 next = rec.next;
 378
 379                 if (rec.magic == TDB_DEAD_MAGIC
 380                     && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
 381                         goto fail;
 382                 }
 383                 rec_ptr = next;
 384         }
 385         res = 0;
 386  fail:
 387         tdb_unlock(tdb, -1, F_WRLCK);
 388         return res;
 389 }
 390
 391 /* delete an entry in the database given a key */
 392 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 393 {
 394         tdb_off_t rec_ptr;
 395         struct tdb_record rec;
 396         int ret;
 397
 398         rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec);
 399         if (rec_ptr == 0) {
 400                 return -1;
 401         }
 402
 403         if (tdb->max_dead_records != 0) {
 404
 405                 uint32_t magic = TDB_DEAD_MAGIC;
 406
 407                 /*
 408                  * Allow for some dead records per hash chain, mainly for
 409                  * tdb's with a very high create/delete rate like locking.tdb.
 410                  */
 411
 412                 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
 413                         /*
 414                          * Don't let the per-chain freelist grow too large,
 415                          * delete all existing dead records
 416                          */
 417                         tdb_purge_dead(tdb, hash);
 418                 }
 419
 420                 /*
 421                  * Just mark the record as dead.
 422                  */
 423                 ret = tdb_ofs_write(
 424                         tdb, rec_ptr + offsetof(struct tdb_record, magic),
 425                         &magic);
 426         }
 427         else {
 428                 ret = tdb_do_delete(tdb, rec_ptr, &rec);
 429         }
 430
 431         if (ret == 0) {
 432                 tdb_increment_seqnum(tdb);
 433         }
 434
 435         if (tdb_unlock(tdb, BUCKET(hash), F_WRLCK) != 0)
 436                 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
 437         return ret;
 438 }
 439
 440 _PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
 441 {
 442         uint32_t hash = tdb->hash_fn(&key);
 443         int ret;
 444
 445         ret = tdb_delete_hash(tdb, key, hash);
 446         tdb_trace_1rec_ret(tdb, "tdb_delete", key, ret);
 447         return ret;
 448 }
 449
 450 /*
 451  * See if we have a dead record around with enough space
 452  */
 453 tdb_off_t tdb_find_dead(struct tdb_context *tdb, uint32_t hash,
 454                         struct tdb_record *r, tdb_len_t length,
 455                         tdb_off_t *p_last_ptr)
 456 {
 457         tdb_off_t rec_ptr, last_ptr;
 458         tdb_off_t best_rec_ptr = 0;
 459         tdb_off_t best_last_ptr = 0;
 460         struct tdb_record best = { .rec_len = UINT32_MAX };
 461
 462         length += sizeof(tdb_off_t); /* tailer */
 463
 464         last_ptr = TDB_HASH_TOP(hash);
 465
 466         /* read in the hash top */
 467         if (tdb_ofs_read(tdb, last_ptr, &rec_ptr) == -1)
 468                 return 0;
 469
 470         /* keep looking until we find the right record */
 471         while (rec_ptr) {
 472                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
 473                         return 0;
 474
 475                 if (TDB_DEAD(r) && (r->rec_len >= length) &&
 476                     (r->rec_len < best.rec_len)) {
 477                         best_rec_ptr = rec_ptr;
 478                         best_last_ptr = last_ptr;
 479                         best = *r;
 480                 }
 481                 last_ptr = rec_ptr;
 482                 rec_ptr = r->next;
 483         }
 484
 485         if (best.rec_len == UINT32_MAX) {
 486                 return 0;
 487         }
 488
 489         *r = best;
 490         *p_last_ptr = best_last_ptr;
 491         return best_rec_ptr;
 492 }
 493
 494 static int _tdb_store(struct tdb_context *tdb, TDB_DATA key,
 495                        TDB_DATA dbuf, int flag, uint32_t hash)
 496 {
 497         struct tdb_record rec;
 498         tdb_off_t rec_ptr;
 499         int ret = -1;
 500
 501         /* check for it existing, on insert. */
 502         if (flag == TDB_INSERT) {
 503                 if (tdb_exists_hash(tdb, key, hash)) {
 504                         tdb->ecode = TDB_ERR_EXISTS;
 505                         goto fail;
 506                 }
 507         } else {
 508                 /* first try in-place update, on modify or replace. */
 509                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
 510                         goto done;
 511                 }
 512                 if (tdb->ecode == TDB_ERR_NOEXIST &&
 513                     flag == TDB_MODIFY) {
 514                         /* if the record doesn't exist and we are in TDB_MODIFY mode then
 515                          we should fail the store */
 516                         goto fail;
 517                 }
 518         }
 519         /* reset the error code potentially set by the tdb_update_hash() */
 520         tdb->ecode = TDB_SUCCESS;
 521
 522         /* delete any existing record - if it doesn't exist we don't
 523            care.  Doing this first reduces fragmentation, and avoids
 524            coalescing with `allocated' block before it's updated. */
 525         if (flag != TDB_INSERT)
 526                 tdb_delete_hash(tdb, key, hash);
 527
 528         /* we have to allocate some space */
 529         rec_ptr = tdb_allocate(tdb, hash, key.dsize + dbuf.dsize, &rec);
 530
 531         if (rec_ptr == 0) {
 532                 goto fail;
 533         }
 534
 535         /* Read hash top into next ptr */
 536         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 537                 goto fail;
 538
 539         rec.key_len = key.dsize;
 540         rec.data_len = dbuf.dsize;
 541         rec.full_hash = hash;
 542         rec.magic = TDB_MAGIC;
 543
 544         /* write out and point the top of the hash chain at it */
 545         if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
 546             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec),
 547                                        key.dptr, key.dsize) == -1
 548             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec)+key.dsize,
 549                                        dbuf.dptr, dbuf.dsize) == -1
 550             || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 551                 /* Need to tdb_unallocate() here */
 552                 goto fail;
 553         }
 554
 555  done:
 556         ret = 0;
 557  fail:
 558         if (ret == 0) {
 559                 tdb_increment_seqnum(tdb);
 560         }
 561         return ret;
 562 }
 563
 564 /* store an element in the database, replacing any existing element
 565    with the same key
 566
 567    return 0 on success, -1 on failure
 568 */
 569 _PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 570 {
 571         uint32_t hash;
 572         int ret;
 573
 574         if (tdb->read_only || tdb->traverse_read) {
 575                 tdb->ecode = TDB_ERR_RDONLY;
 576                 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, -1);
 577                 return -1;
 578         }
 579
 580         /* find which hash bucket it is in */
 581         hash = tdb->hash_fn(&key);
 582         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 583                 return -1;
 584
 585         ret = _tdb_store(tdb, key, dbuf, flag, hash);
 586         tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, ret);
 587         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 588         return ret;
 589 }
 590
 591 /* Append to an entry. Create if not exist. */
 592 _PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 593 {
 594         uint32_t hash;
 595         TDB_DATA dbuf;
 596         int ret = -1;
 597
 598         /* find which hash bucket it is in */
 599         hash = tdb->hash_fn(&key);
 600         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 601                 return -1;
 602
 603         dbuf = _tdb_fetch(tdb, key);
 604
 605         if (dbuf.dptr == NULL) {
 606                 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
 607         } else {
 608                 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
 609                 unsigned char *new_dptr;
 610
 611                 /* realloc '0' is special: don't do that. */
 612                 if (new_len == 0)
 613                         new_len = 1;
 614                 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
 615                 if (new_dptr == NULL) {
 616                         free(dbuf.dptr);
 617                 }
 618                 dbuf.dptr = new_dptr;
 619         }
 620
 621         if (dbuf.dptr == NULL) {
 622                 tdb->ecode = TDB_ERR_OOM;
 623                 goto failed;
 624         }
 625
 626         memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
 627         dbuf.dsize += new_dbuf.dsize;
 628
 629         ret = _tdb_store(tdb, key, dbuf, 0, hash);
 630         tdb_trace_2rec_retrec(tdb, "tdb_append", key, new_dbuf, dbuf);
 631
 632 failed:
 633         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 634         SAFE_FREE(dbuf.dptr);
 635         return ret;
 636 }
 637
 638
 639 /*
 640   return the name of the current tdb file
 641   useful for external logging functions
 642 */
 643 _PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
 644 {
 645         return tdb->name;
 646 }
 647
 648 /*
 649   return the underlying file descriptor being used by tdb, or -1
 650   useful for external routines that want to check the device/inode
 651   of the fd
 652 */
 653 _PUBLIC_ int tdb_fd(struct tdb_context *tdb)
 654 {
 655         return tdb->fd;
 656 }
 657
 658 /*
 659   return the current logging function
 660   useful for external tdb routines that wish to log tdb errors
 661 */
 662 _PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
 663 {
 664         return tdb->log.log_fn;
 665 }
 666
 667
 668 /*
 669   get the tdb sequence number. Only makes sense if the writers opened
 670   with TDB_SEQNUM set. Note that this sequence number will wrap quite
 671   quickly, so it should only be used for a 'has something changed'
 672   test, not for code that relies on the count of the number of changes
 673   made. If you want a counter then use a tdb record.
 674
 675   The aim of this sequence number is to allow for a very lightweight
 676   test of a possible tdb change.
 677 */
 678 _PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
 679 {
 680         tdb_off_t seqnum=0;
 681
 682         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
 683         return seqnum;
 684 }
 685
 686 _PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
 687 {
 688         return tdb->hash_size;
 689 }
 690
 691 _PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
 692 {
 693         return tdb->map_size;
 694 }
 695
 696 _PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
 697 {
 698         return tdb->flags;
 699 }
 700
 701 _PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
 702 {
 703         if ((flags & TDB_ALLOW_NESTING) &&
 704             (flags & TDB_DISALLOW_NESTING)) {
 705                 tdb->ecode = TDB_ERR_NESTING;
 706                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_add_flags: "
 707                         "allow_nesting and disallow_nesting are not allowed together!"));
 708                 return;
 709         }
 710
 711         if (flags & TDB_ALLOW_NESTING) {
 712                 tdb->flags &= ~TDB_DISALLOW_NESTING;
 713         }
 714         if (flags & TDB_DISALLOW_NESTING) {
 715                 tdb->flags &= ~TDB_ALLOW_NESTING;
 716         }
 717
 718         tdb->flags |= flags;
 719 }
 720
 721 _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
 722 {
 723         if ((flags & TDB_ALLOW_NESTING) &&
 724             (flags & TDB_DISALLOW_NESTING)) {
 725                 tdb->ecode = TDB_ERR_NESTING;
 726                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
 727                         "allow_nesting and disallow_nesting are not allowed together!"));
 728                 return;
 729         }
 730
 731         if ((flags & TDB_NOLOCK) &&
 732             (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) &&
 733             (tdb->mutexes == NULL)) {
 734                 tdb->ecode = TDB_ERR_LOCK;
 735                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
 736                          "Can not remove NOLOCK flag on mutexed databases"));
 737                 return;
 738         }
 739
 740         if (flags & TDB_ALLOW_NESTING) {
 741                 tdb->flags |= TDB_DISALLOW_NESTING;
 742         }
 743         if (flags & TDB_DISALLOW_NESTING) {
 744                 tdb->flags |= TDB_ALLOW_NESTING;
 745         }
 746
 747         tdb->flags &= ~flags;
 748 }
 749
 750
 751 /*
 752   enable sequence number handling on an open tdb
 753 */
 754 _PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
 755 {
 756         tdb->flags |= TDB_SEQNUM;
 757 }
 758
 759
 760 /*
 761   add a region of the file to the freelist. Length is the size of the region in bytes,
 762   which includes the free list header that needs to be added
 763  */
 764 static int tdb_free_region(struct tdb_context *tdb, tdb_off_t offset, ssize_t length)
 765 {
 766         struct tdb_record rec;
 767         if (length <= sizeof(rec)) {
 768                 /* the region is not worth adding */
 769                 return 0;
 770         }
 771         if (length + offset > tdb->map_size) {
 772                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: adding region beyond end of file\n"));
 773                 return -1;
 774         }
 775         memset(&rec,'\0',sizeof(rec));
 776         rec.rec_len = length - sizeof(rec);
 777         if (tdb_free(tdb, offset, &rec) == -1) {
 778                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: failed to add free record\n"));
 779                 return -1;
 780         }
 781         return 0;
 782 }
 783
 784 /*
 785   wipe the entire database, deleting all records. This can be done
 786   very fast by using a allrecord lock. The entire data portion of the
 787   file becomes a single entry in the freelist.
 788
 789   This code carefully steps around the recovery area, leaving it alone
 790  */
 791 _PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
 792 {
 793         uint32_t i;
 794         tdb_off_t offset = 0;
 795         ssize_t data_len;
 796         tdb_off_t recovery_head;
 797         tdb_len_t recovery_size = 0;
 798
 799         if (tdb_lockall(tdb) != 0) {
 800                 return -1;
 801         }
 802
 803         tdb_trace(tdb, "tdb_wipe_all");
 804
 805         /* see if the tdb has a recovery area, and remember its size
 806            if so. We don't want to lose this as otherwise each
 807            tdb_wipe_all() in a transaction will increase the size of
 808            the tdb by the size of the recovery area */
 809         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
 810                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n"));
 811                 goto failed;
 812         }
 813
 814         if (recovery_head != 0) {
 815                 struct tdb_record rec;
 816                 if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
 817                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n"));
 818                         return -1;
 819                 }
 820                 recovery_size = rec.rec_len + sizeof(rec);
 821         }
 822
 823         /* wipe the hashes */
 824         for (i=0;i<tdb->hash_size;i++) {
 825                 if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) {
 826                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i));
 827                         goto failed;
 828                 }
 829         }
 830
 831         /* wipe the freelist */
 832         if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 833                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n"));
 834                 goto failed;
 835         }
 836
 837         /* add all the rest of the file to the freelist, possibly leaving a gap
 838            for the recovery area */
 839         if (recovery_size == 0) {
 840                 /* the simple case - the whole file can be used as a freelist */
 841                 data_len = (tdb->map_size - TDB_DATA_START(tdb->hash_size));
 842                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 843                         goto failed;
 844                 }
 845         } else {
 846                 /* we need to add two freelist entries - one on either
 847                    side of the recovery area
 848
 849                    Note that we cannot shift the recovery area during
 850                    this operation. Only the transaction.c code may
 851                    move the recovery area or we risk subtle data
 852                    corruption
 853                 */
 854                 data_len = (recovery_head - TDB_DATA_START(tdb->hash_size));
 855                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 856                         goto failed;
 857                 }
 858                 /* and the 2nd free list entry after the recovery area - if any */
 859                 data_len = tdb->map_size - (recovery_head+recovery_size);
 860                 if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
 861                         goto failed;
 862                 }
 863         }
 864
 865         tdb_increment_seqnum_nonblock(tdb);
 866
 867         if (tdb_unlockall(tdb) != 0) {
 868                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n"));
 869                 goto failed;
 870         }
 871
 872         return 0;
 873
 874 failed:
 875         tdb_unlockall(tdb);
 876         return -1;
 877 }
 878
 879 struct traverse_state {
 880         bool error;
 881         struct tdb_context *dest_db;
 882 };
 883
 884 /*
 885   traverse function for repacking
 886  */
 887 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
 888 {
 889         struct traverse_state *state = (struct traverse_state *)private_data;
 890         if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
 891                 state->error = true;
 892                 return -1;
 893         }
 894         return 0;
 895 }
 896
 897 /*
 898   repack a tdb
 899  */
 900 _PUBLIC_ int tdb_repack(struct tdb_context *tdb)
 901 {
 902         struct tdb_context *tmp_db;
 903         struct traverse_state state;
 904
 905         tdb_trace(tdb, "tdb_repack");
 906
 907         if (tdb_transaction_start(tdb) != 0) {
 908                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to start transaction\n"));
 909                 return -1;
 910         }
 911
 912         tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
 913         if (tmp_db == NULL) {
 914                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to create tmp_db\n"));
 915                 tdb_transaction_cancel(tdb);
 916                 return -1;
 917         }
 918
 919         state.error = false;
 920         state.dest_db = tmp_db;
 921
 922         if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
 923                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying out\n"));
 924                 tdb_transaction_cancel(tdb);
 925                 tdb_close(tmp_db);
 926                 return -1;
 927         }
 928
 929         if (state.error) {
 930                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during traversal\n"));
 931                 tdb_transaction_cancel(tdb);
 932                 tdb_close(tmp_db);
 933                 return -1;
 934         }
 935
 936         if (tdb_wipe_all(tdb) != 0) {
 937                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to wipe database\n"));
 938                 tdb_transaction_cancel(tdb);
 939                 tdb_close(tmp_db);
 940                 return -1;
 941         }
 942
 943         state.error = false;
 944         state.dest_db = tdb;
 945
 946         if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
 947                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying back\n"));
 948                 tdb_transaction_cancel(tdb);
 949                 tdb_close(tmp_db);
 950                 return -1;
 951         }
 952
 953         if (state.error) {
 954                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during second traversal\n"));
 955                 tdb_transaction_cancel(tdb);
 956                 tdb_close(tmp_db);
 957                 return -1;
 958         }
 959
 960         tdb_close(tmp_db);
 961
 962         if (tdb_transaction_commit(tdb) != 0) {
 963                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to commit\n"));
 964                 return -1;
 965         }
 966
 967         return 0;
 968 }
 969
 970 /* Even on files, we can get partial writes due to signals. */
 971 bool tdb_write_all(int fd, const void *buf, size_t count)
 972 {
 973         while (count) {
 974                 ssize_t ret;
 975                 ret = write(fd, buf, count);
 976                 if (ret < 0)
 977                         return false;
 978                 buf = (const char *)buf + ret;
 979                 count -= ret;
 980         }
 981         return true;
 982 }
 983
 984 bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret)
 985 {
 986         tdb_off_t ret = a + b;
 987
 988         if ((ret < a) || (ret < b)) {
 989                 return false;
 990         }
 991         *pret = ret;
 992         return true;
 993 }
 994
 995 #ifdef TDB_TRACE
 996 static void tdb_trace_write(struct tdb_context *tdb, const char *str)
 997 {
 998         if (!tdb_write_all(tdb->tracefd, str, strlen(str))) {
 999                 close(tdb->tracefd);
1000                 tdb->tracefd = -1;
1001         }
1002 }
1003
1004 static void tdb_trace_start(struct tdb_context *tdb)
1005 {
1006         tdb_off_t seqnum=0;
1007         char msg[sizeof(tdb_off_t) * 4 + 1];
1008
1009         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
1010         snprintf(msg, sizeof(msg), "%u ", seqnum);
1011         tdb_trace_write(tdb, msg);
1012 }
1013
/*
  Finish a trace line by writing the terminating newline.
*/
static void tdb_trace_end(struct tdb_context *tdb)
{
        static const char newline[] = "\n";

        tdb_trace_write(tdb, newline);
}
1018
/*
  Finish a trace line that carries an integer result: writes
  " = <ret>" followed by a newline.
*/
static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
{
        char suffix[sizeof(ret) * 4 + 4];

        snprintf(suffix, sizeof(suffix), " = %i\n", ret);
        tdb_trace_write(tdb, suffix);
}
1025
1026 static void tdb_trace_record(struct tdb_context *tdb, TDB_DATA rec)
1027 {
1028         char msg[20 + rec.dsize*2], *p;
1029         unsigned int i;
1030
1031         /* We differentiate zero-length records from non-existent ones. */
1032         if (rec.dptr == NULL) {
1033                 tdb_trace_write(tdb, " NULL");
1034                 return;
1035         }
1036
1037         /* snprintf here is purely cargo-cult programming. */
1038         p = msg;
1039         p += snprintf(p, sizeof(msg), " %zu:", rec.dsize);
1040         for (i = 0; i < rec.dsize; i++)
1041                 p += snprintf(p, 2, "%02x", rec.dptr[i]);
1042
1043         tdb_trace_write(tdb, msg);
1044 }
1045
/*
  Trace an operation that has no arguments and no result:
  "<seqnum> <op>" followed by a newline.
*/
void tdb_trace(struct tdb_context *tdb, const char *op)
{
        /* line prefix */
        tdb_trace_start(tdb);

        /* operation name */
        tdb_trace_write(tdb, op);

        /* line terminator */
        tdb_trace_end(tdb);
}
1052
1053 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op)
1054 {
1055         char msg[sizeof(tdb_off_t) * 4 + 1];
1056
1057         snprintf(msg, sizeof(msg), "%u ", seqnum);
1058         tdb_trace_write(tdb, msg);
1059         tdb_trace_write(tdb, op);
1060         tdb_trace_end(tdb);
1061 }
1062
/*
  Trace an open-style operation: the line carries the operation name,
  the hash size in decimal and the tdb and open flags in hex.
*/
void tdb_trace_open(struct tdb_context *tdb, const char *op,
                    unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
{
        char details[128];

        /* the text is formatted up front and bounded by the buffer size */
        snprintf(details, sizeof(details),
                 "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);

        tdb_trace_start(tdb);
        tdb_trace_write(tdb, details);
        tdb_trace_end(tdb);
}
1074
/*
  Trace an operation without arguments that returned an integer:
  "<seqnum> <op> = <ret>" followed by a newline.
*/
void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
{
        /* line prefix and operation name */
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);

        /* result and line terminator */
        tdb_trace_end_ret(tdb, ret);
}
1081
1082 void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret)
1083 {
1084         tdb_trace_start(tdb);
1085         tdb_trace_write(tdb, op);
1086         tdb_trace_write(tdb, " =");
1087         tdb_trace_record(tdb, ret);
1088         tdb_trace_end(tdb);
1089 }
1090
1091 void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
1092                     TDB_DATA rec)
1093 {
1094         tdb_trace_start(tdb);
1095         tdb_trace_write(tdb, op);
1096         tdb_trace_record(tdb, rec);
1097         tdb_trace_end(tdb);
1098 }
1099
1100 void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
1101                         TDB_DATA rec, int ret)
1102 {
1103         tdb_trace_start(tdb);
1104         tdb_trace_write(tdb, op);
1105         tdb_trace_record(tdb, rec);
1106         tdb_trace_end_ret(tdb, ret);
1107 }
1108
1109 void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
1110                            TDB_DATA rec, TDB_DATA ret)
1111 {
1112         tdb_trace_start(tdb);
1113         tdb_trace_write(tdb, op);
1114         tdb_trace_record(tdb, rec);
1115         tdb_trace_write(tdb, " =");
1116         tdb_trace_record(tdb, ret);
1117         tdb_trace_end(tdb);
1118 }
1119
1120 void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
1121                              TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
1122                              int ret)
1123 {
1124         char msg[1 + sizeof(ret) * 4];
1125
1126         snprintf(msg, sizeof(msg), " %#x", flag);
1127         tdb_trace_start(tdb);
1128         tdb_trace_write(tdb, op);
1129         tdb_trace_record(tdb, rec1);
1130         tdb_trace_record(tdb, rec2);
1131         tdb_trace_write(tdb, msg);
1132         tdb_trace_end_ret(tdb, ret);
1133 }
1134
1135 void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
1136                            TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret)
1137 {
1138         tdb_trace_start(tdb);
1139         tdb_trace_write(tdb, op);
1140         tdb_trace_record(tdb, rec1);
1141         tdb_trace_record(tdb, rec2);
1142         tdb_trace_write(tdb, " =");
1143         tdb_trace_record(tdb, ret);
1144         tdb_trace_end(tdb);
1145 }
1146 #endif