lib/tdb/common/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
  30 _PUBLIC_ TDB_DATA tdb_null;
  31
  32 /*
  33   non-blocking increment of the tdb sequence number if the tdb has been opened using
  34   the TDB_SEQNUM flag
  35 */
  36 _PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
  37 {
  38         tdb_off_t seqnum=0;
  39
  40         if (!(tdb->flags & TDB_SEQNUM)) {
  41                 return;
  42         }
  43
  44         /* we ignore errors from this, as we have no sane way of
  45            dealing with them.
  46         */
  47         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
  48         seqnum++;
  49         tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
  50 }
  51
  52 /*
  53   increment the tdb sequence number if the tdb has been opened using
  54   the TDB_SEQNUM flag
  55 */
  56 static void tdb_increment_seqnum(struct tdb_context *tdb)
  57 {
  58         if (!(tdb->flags & TDB_SEQNUM)) {
  59                 return;
  60         }
  61
  62         if (tdb->transaction != NULL) {
  63                 tdb_increment_seqnum_nonblock(tdb);
  64                 return;
  65         }
  66
  67         if (tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
  68                           TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
  69                 return;
  70         }
  71
  72         tdb_increment_seqnum_nonblock(tdb);
  73
  74         tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
  75 }
  76
  77 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
  78 {
  79         return memcmp(data.dptr, key.dptr, data.dsize);
  80 }
  81
  82 /* Returns 0 on fail.  On success, return offset of record, and fills
  83    in rec */
  84 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
  85                         struct tdb_record *r)
  86 {
  87         tdb_off_t rec_ptr;
  88
  89         /* read in the hash top */
  90         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
  91                 return 0;
  92
  93         /* keep looking until we find the right record */
  94         while (rec_ptr) {
  95                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
  96                         return 0;
  97
  98                 if (!TDB_DEAD(r) && hash==r->full_hash
  99                     && key.dsize==r->key_len
 100                     && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
 101                                       r->key_len, tdb_key_compare,
 102                                       NULL) == 0) {
 103                         return rec_ptr;
 104                 }
 105                 /* detect tight infinite loop */
 106                 if (rec_ptr == r->next) {
 107                         tdb->ecode = TDB_ERR_CORRUPT;
 108                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_find: loop detected.\n"));
 109                         return 0;
 110                 }
 111                 rec_ptr = r->next;
 112         }
 113         tdb->ecode = TDB_ERR_NOEXIST;
 114         return 0;
 115 }
 116
 117 /* As tdb_find, but if you succeed, keep the lock */
 118 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
 119                            struct tdb_record *rec)
 120 {
 121         uint32_t rec_ptr;
 122
 123         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 124                 return 0;
 125         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 126                 tdb_unlock(tdb, BUCKET(hash), locktype);
 127         return rec_ptr;
 128 }
 129
 130 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
 131
 132 static int tdb_update_hash_cmp(TDB_DATA key, TDB_DATA data, void *private_data)
 133 {
 134         TDB_DATA *dbuf = (TDB_DATA *)private_data;
 135
 136         if (dbuf->dsize != data.dsize) {
 137                 return -1;
 138         }
 139         if (memcmp(dbuf->dptr, data.dptr, data.dsize) != 0) {
 140                 return -1;
 141         }
 142         return 0;
 143 }
 144
 145 /* update an entry in place - this only works if the new data size
 146    is <= the old data size and the key exists.
 147    on failure return -1.
 148 */
 149 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
 150 {
 151         struct tdb_record rec;
 152         tdb_off_t rec_ptr;
 153
 154         /* find entry */
 155         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 156                 return -1;
 157
 158         /* it could be an exact duplicate of what is there - this is
 159          * surprisingly common (eg. with a ldb re-index). */
 160         if (rec.key_len == key.dsize &&
 161             rec.data_len == dbuf.dsize &&
 162             rec.full_hash == hash &&
 163             tdb_parse_record(tdb, key, tdb_update_hash_cmp, &dbuf) == 0) {
 164                 return 0;
 165         }
 166
 167         /* must be long enough key, data and tailer */
 168         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
 169                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
 170                 return -1;
 171         }
 172
 173         if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 174                       dbuf.dptr, dbuf.dsize) == -1)
 175                 return -1;
 176
 177         if (dbuf.dsize != rec.data_len) {
 178                 /* update size */
 179                 rec.data_len = dbuf.dsize;
 180                 return tdb_rec_write(tdb, rec_ptr, &rec);
 181         }
 182
 183         return 0;
 184 }
 185
 186 /* find an entry in the database given a key */
 187 /* If an entry doesn't exist tdb_err will be set to
 188  * TDB_ERR_NOEXIST. If a key has no data attached
 189  * then the TDB_DATA will have zero length but
 190  * a non-zero pointer
 191  */
 192 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 193 {
 194         tdb_off_t rec_ptr;
 195         struct tdb_record rec;
 196         TDB_DATA ret;
 197         uint32_t hash;
 198
 199         /* find which hash bucket it is in */
 200         hash = tdb->hash_fn(&key);
 201         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 202                 return tdb_null;
 203
 204         ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 205                                   rec.data_len);
 206         ret.dsize = rec.data_len;
 207         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 208         return ret;
 209 }
 210
 211 _PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 212 {
 213         TDB_DATA ret = _tdb_fetch(tdb, key);
 214
 215         tdb_trace_1rec_retrec(tdb, "tdb_fetch", key, ret);
 216         return ret;
 217 }
 218
 219 /*
 220  * Find an entry in the database and hand the record's data to a parsing
 221  * function. The parsing function is executed under the chain read lock, so it
 222  * should be fast and should not block on other syscalls.
 223  *
 224  * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
 225  *
 226  * For mmapped tdb's that do not have a transaction open it points the parsing
 227  * function directly at the mmap area, it avoids the malloc/memcpy in this
 228  * case. If a transaction is open or no mmap is available, it has to do
 229  * malloc/read/parse/free.
 230  *
 231  * This is interesting for all readers of potentially large data structures in
 232  * the tdb records, ldb indexes being one example.
 233  *
 234  * Return -1 if the record was not found.
 235  */
 236
 237 _PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
 238                      int (*parser)(TDB_DATA key, TDB_DATA data,
 239                                    void *private_data),
 240                      void *private_data)
 241 {
 242         tdb_off_t rec_ptr;
 243         struct tdb_record rec;
 244         int ret;
 245         uint32_t hash;
 246
 247         /* find which hash bucket it is in */
 248         hash = tdb->hash_fn(&key);
 249
 250         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
 251                 /* record not found */
 252                 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
 253                 tdb->ecode = TDB_ERR_NOEXIST;
 254                 return -1;
 255         }
 256         tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
 257
 258         ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
 259                              rec.data_len, parser, private_data);
 260
 261         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 262
 263         return ret;
 264 }
 265
 266 /* check if an entry in the database exists
 267
 268    note that 1 is returned if the key is found and 0 is returned if not found
 269    this doesn't match the conventions in the rest of this module, but is
 270    compatible with gdbm
 271 */
 272 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 273 {
 274         struct tdb_record rec;
 275
 276         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 277                 return 0;
 278         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 279         return 1;
 280 }
 281
 282 _PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
 283 {
 284         uint32_t hash = tdb->hash_fn(&key);
 285         int ret;
 286
 287         ret = tdb_exists_hash(tdb, key, hash);
 288         tdb_trace_1rec_ret(tdb, "tdb_exists", key, ret);
 289         return ret;
 290 }
 291
 292 /* actually delete an entry in the database given the offset */
 293 int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec)
 294 {
 295         tdb_off_t last_ptr, i;
 296         struct tdb_record lastrec;
 297
 298         if (tdb->read_only || tdb->traverse_read) return -1;
 299
 300         if (((tdb->traverse_write != 0) && (!TDB_DEAD(rec))) ||
 301             tdb_write_lock_record(tdb, rec_ptr) == -1) {
 302                 /* Someone traversing here: mark it as dead */
 303                 rec->magic = TDB_DEAD_MAGIC;
 304                 return tdb_rec_write(tdb, rec_ptr, rec);
 305         }
 306         if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
 307                 return -1;
 308
 309         /* find previous record in hash chain */
 310         if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 311                 return -1;
 312         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 313                 if (tdb_rec_read(tdb, i, &lastrec) == -1)
 314                         return -1;
 315
 316         /* unlink it: next ptr is at start of record. */
 317         if (last_ptr == 0)
 318                 last_ptr = TDB_HASH_TOP(rec->full_hash);
 319         if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
 320                 return -1;
 321
 322         /* recover the space */
 323         if (tdb_free(tdb, rec_ptr, rec) == -1)
 324                 return -1;
 325         return 0;
 326 }
 327
 328 static int tdb_count_dead(struct tdb_context *tdb, uint32_t hash)
 329 {
 330         int res = 0;
 331         tdb_off_t rec_ptr;
 332         struct tdb_record rec;
 333
 334         /* read in the hash top */
 335         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 336                 return 0;
 337
 338         while (rec_ptr) {
 339                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
 340                         return 0;
 341
 342                 if (rec.magic == TDB_DEAD_MAGIC) {
 343                         res += 1;
 344                 }
 345                 rec_ptr = rec.next;
 346         }
 347         return res;
 348 }
 349
 350 /*
 351  * Purge all DEAD records from a hash chain
 352  */
 353 int tdb_purge_dead(struct tdb_context *tdb, uint32_t hash)
 354 {
 355         int res = -1;
 356         struct tdb_record rec;
 357         tdb_off_t rec_ptr;
 358
 359         if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == -1) {
 360                 /*
 361                  * Don't block the freelist if not strictly necessary
 362                  */
 363                 return -1;
 364         }
 365
 366         /* read in the hash top */
 367         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 368                 goto fail;
 369
 370         while (rec_ptr) {
 371                 tdb_off_t next;
 372
 373                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
 374                         goto fail;
 375                 }
 376
 377                 next = rec.next;
 378
 379                 if (rec.magic == TDB_DEAD_MAGIC
 380                     && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
 381                         goto fail;
 382                 }
 383                 rec_ptr = next;
 384         }
 385         res = 0;
 386  fail:
 387         tdb_unlock(tdb, -1, F_WRLCK);
 388         return res;
 389 }
 390
 391 /* delete an entry in the database given a key */
 392 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 393 {
 394         tdb_off_t rec_ptr;
 395         struct tdb_record rec;
 396         int ret;
 397
 398         rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec);
 399         if (rec_ptr == 0) {
 400                 return -1;
 401         }
 402
 403         if (tdb->max_dead_records != 0) {
 404
 405                 uint32_t magic = TDB_DEAD_MAGIC;
 406
 407                 /*
 408                  * Allow for some dead records per hash chain, mainly for
 409                  * tdb's with a very high create/delete rate like locking.tdb.
 410                  */
 411
 412                 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
 413                         /*
 414                          * Don't let the per-chain freelist grow too large,
 415                          * delete all existing dead records
 416                          */
 417                         tdb_purge_dead(tdb, hash);
 418                 }
 419
 420                 /*
 421                  * Just mark the record as dead.
 422                  */
 423                 ret = tdb_ofs_write(
 424                         tdb, rec_ptr + offsetof(struct tdb_record, magic),
 425                         &magic);
 426         }
 427         else {
 428                 ret = tdb_do_delete(tdb, rec_ptr, &rec);
 429         }
 430
 431         if (ret == 0) {
 432                 tdb_increment_seqnum(tdb);
 433         }
 434
 435         if (tdb_unlock(tdb, BUCKET(hash), F_WRLCK) != 0)
 436                 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
 437         return ret;
 438 }
 439
 440 _PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
 441 {
 442         uint32_t hash = tdb->hash_fn(&key);
 443         int ret;
 444
 445         ret = tdb_delete_hash(tdb, key, hash);
 446         tdb_trace_1rec_ret(tdb, "tdb_delete", key, ret);
 447         return ret;
 448 }
 449
 450 /*
 451  * See if we have a dead record around with enough space
 452  */
 453 tdb_off_t tdb_find_dead(struct tdb_context *tdb, uint32_t hash,
 454                         struct tdb_record *r, tdb_len_t length,
 455                         tdb_off_t *p_last_ptr)
 456 {
 457         tdb_off_t rec_ptr, last_ptr;
 458         tdb_off_t best_rec_ptr = 0;
 459         tdb_off_t best_last_ptr = 0;
 460         struct tdb_record best = { .rec_len = UINT32_MAX };
 461
 462         length += sizeof(tdb_off_t); /* tailer */
 463
 464         last_ptr = TDB_HASH_TOP(hash);
 465
 466         /* read in the hash top */
 467         if (tdb_ofs_read(tdb, last_ptr, &rec_ptr) == -1)
 468                 return 0;
 469
 470         /* keep looking until we find the right record */
 471         while (rec_ptr) {
 472                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
 473                         return 0;
 474
 475                 if (TDB_DEAD(r) && (r->rec_len >= length) &&
 476                     (r->rec_len < best.rec_len)) {
 477                         best_rec_ptr = rec_ptr;
 478                         best_last_ptr = last_ptr;
 479                         best = *r;
 480                 }
 481                 last_ptr = rec_ptr;
 482                 rec_ptr = r->next;
 483         }
 484
 485         if (best.rec_len == UINT32_MAX) {
 486                 return 0;
 487         }
 488
 489         *r = best;
 490         *p_last_ptr = best_last_ptr;
 491         return best_rec_ptr;
 492 }
 493
 494 static int _tdb_store(struct tdb_context *tdb, TDB_DATA key,
 495                        TDB_DATA dbuf, int flag, uint32_t hash)
 496 {
 497         struct tdb_record rec;
 498         tdb_off_t rec_ptr;
 499         tdb_len_t rec_len;
 500         int ret = -1;
 501
 502         rec_len = key.dsize + dbuf.dsize;
 503         if ((rec_len < key.dsize) || (rec_len < dbuf.dsize)) {
 504                 tdb->ecode = TDB_ERR_OOM;
 505                 goto fail;
 506         }
 507
 508         /* check for it existing, on insert. */
 509         if (flag == TDB_INSERT) {
 510                 if (tdb_exists_hash(tdb, key, hash)) {
 511                         tdb->ecode = TDB_ERR_EXISTS;
 512                         goto fail;
 513                 }
 514         } else {
 515                 /* first try in-place update, on modify or replace. */
 516                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
 517                         goto done;
 518                 }
 519                 if (tdb->ecode == TDB_ERR_NOEXIST &&
 520                     flag == TDB_MODIFY) {
 521                         /* if the record doesn't exist and we are in TDB_MODIFY mode then
 522                          we should fail the store */
 523                         goto fail;
 524                 }
 525         }
 526         /* reset the error code potentially set by the tdb_update_hash() */
 527         tdb->ecode = TDB_SUCCESS;
 528
 529         /* delete any existing record - if it doesn't exist we don't
 530            care.  Doing this first reduces fragmentation, and avoids
 531            coalescing with `allocated' block before it's updated. */
 532         if (flag != TDB_INSERT)
 533                 tdb_delete_hash(tdb, key, hash);
 534
 535         /* we have to allocate some space */
 536         rec_ptr = tdb_allocate(tdb, hash, rec_len, &rec);
 537
 538         if (rec_ptr == 0) {
 539                 goto fail;
 540         }
 541
 542         /* Read hash top into next ptr */
 543         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 544                 goto fail;
 545
 546         rec.key_len = key.dsize;
 547         rec.data_len = dbuf.dsize;
 548         rec.full_hash = hash;
 549         rec.magic = TDB_MAGIC;
 550
 551         /* write out and point the top of the hash chain at it */
 552         if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
 553             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec),
 554                                        key.dptr, key.dsize) == -1
 555             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec)+key.dsize,
 556                                        dbuf.dptr, dbuf.dsize) == -1
 557             || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 558                 /* Need to tdb_unallocate() here */
 559                 goto fail;
 560         }
 561
 562  done:
 563         ret = 0;
 564  fail:
 565         if (ret == 0) {
 566                 tdb_increment_seqnum(tdb);
 567         }
 568         return ret;
 569 }
 570
 571 /* store an element in the database, replacing any existing element
 572    with the same key
 573
 574    return 0 on success, -1 on failure
 575 */
 576 _PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 577 {
 578         uint32_t hash;
 579         int ret;
 580
 581         if (tdb->read_only || tdb->traverse_read) {
 582                 tdb->ecode = TDB_ERR_RDONLY;
 583                 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, -1);
 584                 return -1;
 585         }
 586
 587         /* find which hash bucket it is in */
 588         hash = tdb->hash_fn(&key);
 589         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 590                 return -1;
 591
 592         ret = _tdb_store(tdb, key, dbuf, flag, hash);
 593         tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, ret);
 594         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 595         return ret;
 596 }
 597
 598 /* Append to an entry. Create if not exist. */
 599 _PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 600 {
 601         uint32_t hash;
 602         TDB_DATA dbuf;
 603         int ret = -1;
 604
 605         /* find which hash bucket it is in */
 606         hash = tdb->hash_fn(&key);
 607         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 608                 return -1;
 609
 610         dbuf = _tdb_fetch(tdb, key);
 611
 612         if (dbuf.dptr == NULL) {
 613                 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
 614         } else {
 615                 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
 616                 unsigned char *new_dptr;
 617
 618                 /* realloc '0' is special: don't do that. */
 619                 if (new_len == 0)
 620                         new_len = 1;
 621                 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
 622                 if (new_dptr == NULL) {
 623                         free(dbuf.dptr);
 624                 }
 625                 dbuf.dptr = new_dptr;
 626         }
 627
 628         if (dbuf.dptr == NULL) {
 629                 tdb->ecode = TDB_ERR_OOM;
 630                 goto failed;
 631         }
 632
 633         memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
 634         dbuf.dsize += new_dbuf.dsize;
 635
 636         ret = _tdb_store(tdb, key, dbuf, 0, hash);
 637         tdb_trace_2rec_retrec(tdb, "tdb_append", key, new_dbuf, dbuf);
 638
 639 failed:
 640         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 641         SAFE_FREE(dbuf.dptr);
 642         return ret;
 643 }
 644
 645
 646 /*
 647   return the name of the current tdb file
 648   useful for external logging functions
 649 */
 650 _PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
 651 {
 652         return tdb->name;
 653 }
 654
 655 /*
 656   return the underlying file descriptor being used by tdb, or -1
 657   useful for external routines that want to check the device/inode
 658   of the fd
 659 */
 660 _PUBLIC_ int tdb_fd(struct tdb_context *tdb)
 661 {
 662         return tdb->fd;
 663 }
 664
 665 /*
 666   return the current logging function
 667   useful for external tdb routines that wish to log tdb errors
 668 */
 669 _PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
 670 {
 671         return tdb->log.log_fn;
 672 }
 673
 674
 675 /*
 676   get the tdb sequence number. Only makes sense if the writers opened
 677   with TDB_SEQNUM set. Note that this sequence number will wrap quite
 678   quickly, so it should only be used for a 'has something changed'
 679   test, not for code that relies on the count of the number of changes
 680   made. If you want a counter then use a tdb record.
 681
 682   The aim of this sequence number is to allow for a very lightweight
 683   test of a possible tdb change.
 684 */
 685 _PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
 686 {
 687         tdb_off_t seqnum=0;
 688
 689         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
 690         return seqnum;
 691 }
 692
 693 _PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
 694 {
 695         return tdb->hash_size;
 696 }
 697
 698 _PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
 699 {
 700         return tdb->map_size;
 701 }
 702
 703 _PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
 704 {
 705         return tdb->flags;
 706 }
 707
 708 _PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
 709 {
 710         if ((flags & TDB_ALLOW_NESTING) &&
 711             (flags & TDB_DISALLOW_NESTING)) {
 712                 tdb->ecode = TDB_ERR_NESTING;
 713                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_add_flags: "
 714                         "allow_nesting and disallow_nesting are not allowed together!"));
 715                 return;
 716         }
 717
 718         if (flags & TDB_ALLOW_NESTING) {
 719                 tdb->flags &= ~TDB_DISALLOW_NESTING;
 720         }
 721         if (flags & TDB_DISALLOW_NESTING) {
 722                 tdb->flags &= ~TDB_ALLOW_NESTING;
 723         }
 724
 725         tdb->flags |= flags;
 726 }
 727
 728 _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
 729 {
 730         if ((flags & TDB_ALLOW_NESTING) &&
 731             (flags & TDB_DISALLOW_NESTING)) {
 732                 tdb->ecode = TDB_ERR_NESTING;
 733                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
 734                         "allow_nesting and disallow_nesting are not allowed together!"));
 735                 return;
 736         }
 737
 738         if ((flags & TDB_NOLOCK) &&
 739             (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) &&
 740             (tdb->mutexes == NULL)) {
 741                 tdb->ecode = TDB_ERR_LOCK;
 742                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
 743                          "Can not remove NOLOCK flag on mutexed databases"));
 744                 return;
 745         }
 746
 747         if (flags & TDB_ALLOW_NESTING) {
 748                 tdb->flags |= TDB_DISALLOW_NESTING;
 749         }
 750         if (flags & TDB_DISALLOW_NESTING) {
 751                 tdb->flags |= TDB_ALLOW_NESTING;
 752         }
 753
 754         tdb->flags &= ~flags;
 755 }
 756
 757
 758 /*
 759   enable sequence number handling on an open tdb
 760 */
 761 _PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
 762 {
 763         tdb->flags |= TDB_SEQNUM;
 764 }
 765
 766
 767 /*
 768   add a region of the file to the freelist. Length is the size of the region in bytes,
 769   which includes the free list header that needs to be added
 770  */
 771 static int tdb_free_region(struct tdb_context *tdb, tdb_off_t offset, ssize_t length)
 772 {
 773         struct tdb_record rec;
 774         if (length <= sizeof(rec)) {
 775                 /* the region is not worth adding */
 776                 return 0;
 777         }
 778         if (length + offset > tdb->map_size) {
 779                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: adding region beyond end of file\n"));
 780                 return -1;
 781         }
 782         memset(&rec,'\0',sizeof(rec));
 783         rec.rec_len = length - sizeof(rec);
 784         if (tdb_free(tdb, offset, &rec) == -1) {
 785                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: failed to add free record\n"));
 786                 return -1;
 787         }
 788         return 0;
 789 }
 790
 791 /*
 792   wipe the entire database, deleting all records. This can be done
 793   very fast by using a allrecord lock. The entire data portion of the
 794   file becomes a single entry in the freelist.
 795
 796   This code carefully steps around the recovery area, leaving it alone
 797  */
 798 _PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
 799 {
 800         uint32_t i;
 801         tdb_off_t offset = 0;
 802         ssize_t data_len;
 803         tdb_off_t recovery_head;
 804         tdb_len_t recovery_size = 0;
 805
 806         if (tdb_lockall(tdb) != 0) {
 807                 return -1;
 808         }
 809
 810         tdb_trace(tdb, "tdb_wipe_all");
 811
 812         /* see if the tdb has a recovery area, and remember its size
 813            if so. We don't want to lose this as otherwise each
 814            tdb_wipe_all() in a transaction will increase the size of
 815            the tdb by the size of the recovery area */
 816         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
 817                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n"));
 818                 goto failed;
 819         }
 820
 821         if (recovery_head != 0) {
 822                 struct tdb_record rec;
 823                 if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
 824                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n"));
 825                         return -1;
 826                 }
 827                 recovery_size = rec.rec_len + sizeof(rec);
 828         }
 829
 830         /* wipe the hashes */
 831         for (i=0;i<tdb->hash_size;i++) {
 832                 if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) {
 833                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i));
 834                         goto failed;
 835                 }
 836         }
 837
 838         /* wipe the freelist */
 839         if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 840                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n"));
 841                 goto failed;
 842         }
 843
 844         /* add all the rest of the file to the freelist, possibly leaving a gap
 845            for the recovery area */
 846         if (recovery_size == 0) {
 847                 /* the simple case - the whole file can be used as a freelist */
 848                 data_len = (tdb->map_size - TDB_DATA_START(tdb->hash_size));
 849                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 850                         goto failed;
 851                 }
 852         } else {
 853                 /* we need to add two freelist entries - one on either
 854                    side of the recovery area
 855
 856                    Note that we cannot shift the recovery area during
 857                    this operation. Only the transaction.c code may
 858                    move the recovery area or we risk subtle data
 859                    corruption
 860                 */
 861                 data_len = (recovery_head - TDB_DATA_START(tdb->hash_size));
 862                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 863                         goto failed;
 864                 }
 865                 /* and the 2nd free list entry after the recovery area - if any */
 866                 data_len = tdb->map_size - (recovery_head+recovery_size);
 867                 if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
 868                         goto failed;
 869                 }
 870         }
 871
 872         tdb_increment_seqnum_nonblock(tdb);
 873
 874         if (tdb_unlockall(tdb) != 0) {
 875                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n"));
 876                 goto failed;
 877         }
 878
 879         return 0;
 880
 881 failed:
 882         tdb_unlockall(tdb);
 883         return -1;
 884 }
 885
/*
  state handed to repack_traverse() through its private_data pointer
 */
struct traverse_state {
        bool error;                     /* set to true by repack_traverse() when a tdb_store() fails */
        struct tdb_context *dest_db;    /* database that repack_traverse() stores each record into */
};
 890
 891 /*
 892   traverse function for repacking
 893  */
 894 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
 895 {
 896         struct traverse_state *state = (struct traverse_state *)private_data;
 897         if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
 898                 state->error = true;
 899                 return -1;
 900         }
 901         return 0;
 902 }
 903
 904 /*
 905   repack a tdb
 906  */
 907 _PUBLIC_ int tdb_repack(struct tdb_context *tdb)
 908 {
 909         struct tdb_context *tmp_db;
 910         struct traverse_state state;
 911
 912         tdb_trace(tdb, "tdb_repack");
 913
 914         if (tdb_transaction_start(tdb) != 0) {
 915                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to start transaction\n"));
 916                 return -1;
 917         }
 918
 919         tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
 920         if (tmp_db == NULL) {
 921                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to create tmp_db\n"));
 922                 tdb_transaction_cancel(tdb);
 923                 return -1;
 924         }
 925
 926         state.error = false;
 927         state.dest_db = tmp_db;
 928
 929         if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
 930                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying out\n"));
 931                 tdb_transaction_cancel(tdb);
 932                 tdb_close(tmp_db);
 933                 return -1;
 934         }
 935
 936         if (state.error) {
 937                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during traversal\n"));
 938                 tdb_transaction_cancel(tdb);
 939                 tdb_close(tmp_db);
 940                 return -1;
 941         }
 942
 943         if (tdb_wipe_all(tdb) != 0) {
 944                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to wipe database\n"));
 945                 tdb_transaction_cancel(tdb);
 946                 tdb_close(tmp_db);
 947                 return -1;
 948         }
 949
 950         state.error = false;
 951         state.dest_db = tdb;
 952
 953         if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
 954                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying back\n"));
 955                 tdb_transaction_cancel(tdb);
 956                 tdb_close(tmp_db);
 957                 return -1;
 958         }
 959
 960         if (state.error) {
 961                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during second traversal\n"));
 962                 tdb_transaction_cancel(tdb);
 963                 tdb_close(tmp_db);
 964                 return -1;
 965         }
 966
 967         tdb_close(tmp_db);
 968
 969         if (tdb_transaction_commit(tdb) != 0) {
 970                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to commit\n"));
 971                 return -1;
 972         }
 973
 974         return 0;
 975 }
 976
 977 /* Even on files, we can get partial writes due to signals. */
 978 bool tdb_write_all(int fd, const void *buf, size_t count)
 979 {
 980         while (count) {
 981                 ssize_t ret;
 982                 ret = write(fd, buf, count);
 983                 if (ret < 0)
 984                         return false;
 985                 buf = (const char *)buf + ret;
 986                 count -= ret;
 987         }
 988         return true;
 989 }
 990
 991 bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret)
 992 {
 993         tdb_off_t ret = a + b;
 994
 995         if ((ret < a) || (ret < b)) {
 996                 return false;
 997         }
 998         *pret = ret;
 999         return true;
1000 }
1001
1002 #ifdef TDB_TRACE
1003 static void tdb_trace_write(struct tdb_context *tdb, const char *str)
1004 {
1005         if (!tdb_write_all(tdb->tracefd, str, strlen(str))) {
1006                 close(tdb->tracefd);
1007                 tdb->tracefd = -1;
1008         }
1009 }
1010
1011 static void tdb_trace_start(struct tdb_context *tdb)
1012 {
1013         tdb_off_t seqnum=0;
1014         char msg[sizeof(tdb_off_t) * 4 + 1];
1015
1016         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
1017         snprintf(msg, sizeof(msg), "%u ", seqnum);
1018         tdb_trace_write(tdb, msg);
1019 }
1020
/*
  Finish a trace line by writing a newline.
 */
static void tdb_trace_end(struct tdb_context *tdb)
{
        tdb_trace_write(tdb, "\n");
}
1025
/*
  Finish a trace line that reports an integer result: writes " = ",
  the value of ret in decimal, and a newline.
 */
static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
{
        char tail[sizeof(ret) * 4 + 4];

        snprintf(tail, sizeof(tail), " = %i\n", ret);

        tdb_trace_write(tdb, tail);
}
1032
1033 static void tdb_trace_record(struct tdb_context *tdb, TDB_DATA rec)
1034 {
1035         char msg[20 + rec.dsize*2], *p;
1036         unsigned int i;
1037
1038         /* We differentiate zero-length records from non-existent ones. */
1039         if (rec.dptr == NULL) {
1040                 tdb_trace_write(tdb, " NULL");
1041                 return;
1042         }
1043
1044         /* snprintf here is purely cargo-cult programming. */
1045         p = msg;
1046         p += snprintf(p, sizeof(msg), " %zu:", rec.dsize);
1047         for (i = 0; i < rec.dsize; i++)
1048                 p += snprintf(p, 2, "%02x", rec.dptr[i]);
1049
1050         tdb_trace_write(tdb, msg);
1051 }
1052
/*
  Trace an operation that has no arguments and no result: the line is the
  tdb_trace_start() prefix, the string op, and a newline.
 */
void tdb_trace(struct tdb_context *tdb, const char *op)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_end(tdb);
}
1059
1060 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op)
1061 {
1062         char msg[sizeof(tdb_off_t) * 4 + 1];
1063
1064         snprintf(msg, sizeof(msg), "%u ", seqnum);
1065         tdb_trace_write(tdb, msg);
1066         tdb_trace_write(tdb, op);
1067         tdb_trace_end(tdb);
1068 }
1069
/*
  Trace an open-style operation: after the tdb_trace_start() prefix the
  line holds op, hash_size in decimal, and tdb_flags and open_flags in
  hex with a "0x" prefix, separated by spaces, then a newline.

  The text is built in a 128-byte buffer; snprintf() truncates anything
  longer.
 */
void tdb_trace_open(struct tdb_context *tdb, const char *op,
                    unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
{
        char details[128];

        tdb_trace_start(tdb);

        snprintf(details, sizeof(details),
                 "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);
        tdb_trace_write(tdb, details);

        tdb_trace_end(tdb);
}
1081
/*
  Trace an operation with an integer result: the tdb_trace_start() prefix,
  the string op, then the result as written by tdb_trace_end_ret().
 */
void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_end_ret(tdb, ret);
}
1088
/*
  Trace an operation that returns a record: the tdb_trace_start() prefix,
  the string op, " =", the returned record as written by
  tdb_trace_record(), and a newline.
 */
void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_write(tdb, " =");
        tdb_trace_record(tdb, ret);
        tdb_trace_end(tdb);
}
1097
/*
  Trace an operation that takes one record and reports no result: the
  tdb_trace_start() prefix, the string op, the record as written by
  tdb_trace_record(), and a newline.
 */
void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
                    TDB_DATA rec)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec);
        tdb_trace_end(tdb);
}
1106
/*
  Trace an operation that takes one record and returns an integer: the
  tdb_trace_start() prefix, the string op, the record as written by
  tdb_trace_record(), then the result as written by tdb_trace_end_ret().
 */
void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
                        TDB_DATA rec, int ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec);
        tdb_trace_end_ret(tdb, ret);
}
1115
/*
  Trace an operation that takes one record and returns a record: the
  tdb_trace_start() prefix, the string op, the argument record, " =",
  the returned record, and a newline. Both records are written by
  tdb_trace_record().
 */
void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
                           TDB_DATA rec, TDB_DATA ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec);
        tdb_trace_write(tdb, " =");
        tdb_trace_record(tdb, ret);
        tdb_trace_end(tdb);
}
1126
1127 void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
1128                              TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
1129                              int ret)
1130 {
1131         char msg[1 + sizeof(ret) * 4];
1132
1133         snprintf(msg, sizeof(msg), " %#x", flag);
1134         tdb_trace_start(tdb);
1135         tdb_trace_write(tdb, op);
1136         tdb_trace_record(tdb, rec1);
1137         tdb_trace_record(tdb, rec2);
1138         tdb_trace_write(tdb, msg);
1139         tdb_trace_end_ret(tdb, ret);
1140 }
1141
/*
  Trace an operation that takes two records and returns a record: the
  tdb_trace_start() prefix, the string op, rec1, rec2, " =", the returned
  record, and a newline. All three records are written by
  tdb_trace_record().
 */
void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
                           TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret)
{
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);
        tdb_trace_record(tdb, rec1);
        tdb_trace_record(tdb, rec2);
        tdb_trace_write(tdb, " =");
        tdb_trace_record(tdb, ret);
        tdb_trace_end(tdb);
}
1153 #endif