2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Andrew Tridgell 2005
7 Copyright (C) Rusty Russell 2010
9 ** NOTE! The following LGPL license applies to the ntdb
10 ** library. This does NOT imply that all of Samba is released
13 This library is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Lesser General Public
15 License as published by the Free Software Foundation; either
16 version 3 of the License, or (at your option) any later version.
18 This library is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Lesser General Public License for more details.
23 You should have received a copy of the GNU Lesser General Public
24 License along with this library; if not, see <http://www.gnu.org/licenses/>.
/* Free x when it is non-NULL and then reset x to NULL, so that a second
   SAFE_FREE on the same pointer does nothing. x is expanded several
   times and is assigned to, so it must be a side-effect-free lvalue. */
28 #define SAFE_FREE(x) do { if ((x) != NULL) {free((void *)x); (x)=NULL;} } while(0)
33 - only allow a single transaction at a time per database. This makes
34 using the transaction API simpler, as otherwise the caller would
35 have to cope with temporary failures in transactions that conflict
36 with other current transactions
38 - keep the transaction recovery information in the same file as the
39 database, using a special 'transaction recovery' record pointed at
40 by the header. This removes the need for extra journal files as
41 used by some other databases
43 - dynamically allocate the transaction recovery record, re-using it
44 for subsequent transactions. If a larger record is needed then
45 ntdb_free() the old record to place it on the normal ntdb freelist
46 before allocating the new record
48 - during transactions, keep a linked list of all writes that have
49 been performed by intercepting all ntdb_write() calls. The hooked
50 transaction versions of ntdb_read() and ntdb_write() check this
51 linked list and try to use the elements of the list in preference
54 - don't allow any locks to be held when a transaction starts,
55 otherwise we can end up with deadlock (plus lack of lock nesting
56 in POSIX locks would mean the lock is lost)
58 - if the caller gains a lock during the transaction but doesn't
59 release it then fail the commit
61 - allow for nested calls to ntdb_transaction_start(), re-using the
62 existing transaction record. If the inner transaction is canceled
63 then a subsequent commit will fail
65 - keep a mirrored copy of the ntdb hash chain heads to allow for the
66 fast hash heads scan on traverse, updating the mirrored copy in
67 the transaction version of ntdb_write
69 - allow callers to mix transaction and non-transaction use of ntdb,
70 although once a transaction is started then an exclusive lock is
71 gained until the transaction is committed or canceled
73 - the commit strategy involves first saving away all modified data
74 into a linearised buffer in the transaction recovery area, then
75 marking the transaction recovery area with a magic value to
76 indicate a valid recovery record. In total 4 fsync/msync calls are
77 needed per commit to prevent race conditions. It might be possible
78 to reduce this to 3 or even 2 with some more work.
80 - check for a valid recovery record on open of the ntdb, while the
81 open lock is held. Automatically recover from the transaction
82 recovery area if needed, then continue with the open as
83 usual. This allows for smooth crash recovery with no administrator
86 - if NTDB_NOSYNC is passed to flags in ntdb_open then transactions are
87 still available, but no transaction recovery area is used and no
88 fsync/msync calls are made.
92 hold the context of any current transaction
/* State of the transaction currently open on an ntdb_context. It is
   allocated zero-filled by ntdb_transaction_start() and released by
   _ntdb_transaction_cancel(). */
94 struct ntdb_transaction {
95 /* the original io methods - used to do IOs to the real db */
96 const struct ntdb_methods *io_methods;
98 /* the list of transaction blocks. When a block is first
99 written to, it gets created in this list */
/* NOTE(review): the members described by the comment above are not
   visible in this excerpt; the functions below access them as
   transaction->blocks and transaction->num_blocks. */
102 size_t last_block_size; /* number of valid bytes in the last block */
104 /* non-zero when an internal transaction error has
105 occurred. All write operations will then fail until the
106 transaction is ended */
107 int transaction_error;
109 /* when inside a transaction we need to keep track of any
110 nested ntdb_transaction_start() calls, as these are allowed,
111 but don't create a new transaction */
112 unsigned int nesting;
114 /* set when a prepare has already occurred */
/* NOTE(review): the flag the comment above belongs to is not visible
   here; presumably it is the transaction->prepared member that the
   functions below test and set. */
/* file offset of the magic field of the on-disk recovery record, set by
   transaction_setup_recovery(); while it is zero,
   _ntdb_transaction_cancel() does not rewrite the recovery marker */
116 ntdb_off_t magic_offset;
118 /* old file size before transaction */
/* also updated by create_recovery_area() and ntdb_transaction_commit()
   so that the cancel path restores the intended map_size */
119 ntdb_len_t old_map_size;
/* Size in bytes of one in-memory transaction block: the functions below
   split a file offset into a block index (off / PAGESIZE) and a
   position inside that block (off % PAGESIZE). */
122 /* This doesn't really need to be pagesize, but we use it for similar reasons. */
123 #define PAGESIZE 65536
126 read while in a transaction. We need to check first if the data is in our list
127 of transaction elements, then if not do a real read
/* Transaction-aware read of len bytes at file offset off into buf.
   A request that crosses a PAGESIZE boundary is split by recursive
   calls. Within a single block the data is copied from the in-memory
   transaction block when one exists, otherwise it is read through the
   saved io_methods->tread(). The tail of the function marks the
   transaction as failed (transaction_error = 1) and logs the error.
   NOTE(review): several original lines (declarations, braces, returns,
   labels) are not visible in this excerpt. */
129 static enum NTDB_ERROR transaction_read(struct ntdb_context *ntdb, ntdb_off_t off,
130 void *buf, ntdb_len_t len)
133 enum NTDB_ERROR ecode;
135 /* break it down into block sized ops */
136 while (len + (off % PAGESIZE) > PAGESIZE) {
/* len2 is the number of bytes left in the block that contains off */
137 ntdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
138 ecode = transaction_read(ntdb, off, buf, len2);
139 if (ecode != NTDB_SUCCESS) {
144 buf = (void *)(len2 + (char *)buf);
151 blk = off / PAGESIZE;
153 /* see if we have it in the block list */
154 if (ntdb->transaction->num_blocks <= blk ||
155 ntdb->transaction->blocks[blk] == NULL) {
156 /* nope, do a real read */
157 ecode = ntdb->transaction->io_methods->tread(ntdb, off, buf, len);
158 if (ecode != NTDB_SUCCESS) {
164 /* it is in the block list. Now check for the last block */
/* NOTE(review): the check below compares len alone with
   last_block_size, not (off % PAGESIZE) + len - confirm that this is
   the intended bound. */
165 if (blk == ntdb->transaction->num_blocks-1) {
166 if (len > ntdb->transaction->last_block_size) {
172 /* now copy it out of this block */
173 memcpy(buf, ntdb->transaction->blocks[blk] + (off % PAGESIZE), len);
/* failure path: poison the transaction so a later commit is refused */
177 ntdb->transaction->transaction_error = 1;
178 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
179 "transaction_read: failed at off=%zu len=%zu",
180 (size_t)off, (size_t)len);
185 write while in a transaction
/* Transaction-aware write of len bytes from buf at file offset off.
   The data is stored in PAGESIZE-sized in-memory blocks
   (transaction->blocks) rather than in the file. A block is created the
   first time it is touched and, when it lies below old_map_size, is
   pre-filled from the file through io_methods->tread(). Writes are
   refused once the transaction has been prepared. The tail of the
   function sets transaction_error.
   NOTE(review): several original lines (declarations, braces, returns,
   labels, some call arguments) are not visible in this excerpt. */
187 static enum NTDB_ERROR transaction_write(struct ntdb_context *ntdb, ntdb_off_t off,
188 const void *buf, ntdb_len_t len)
191 enum NTDB_ERROR ecode;
193 /* Only a commit is allowed on a prepared transaction */
194 if (ntdb->transaction->prepared) {
195 ecode = ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_ERROR,
196 "transaction_write: transaction already"
197 " prepared, write not allowed");
201 /* break it up into block sized chunks */
202 while (len + (off % PAGESIZE) > PAGESIZE) {
/* len2 is the number of bytes left in the block that contains off */
203 ntdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
204 ecode = transaction_write(ntdb, off, buf, len2);
205 if (ecode != NTDB_SUCCESS) {
211 buf = (const void *)(len2 + (const char *)buf);
/* from here on off is the position inside block blk */
219 blk = off / PAGESIZE;
220 off = off % PAGESIZE;
222 if (ntdb->transaction->num_blocks <= blk) {
223 uint8_t **new_blocks;
224 /* expand the blocks array */
225 if (ntdb->transaction->blocks == NULL) {
226 new_blocks = (uint8_t **)malloc(
227 (blk+1)*sizeof(uint8_t *));
/* the realloc result goes to a temporary, so the old array is still
   reachable through transaction->blocks if the call fails */
229 new_blocks = (uint8_t **)realloc(
230 ntdb->transaction->blocks,
231 (blk+1)*sizeof(uint8_t *));
233 if (new_blocks == NULL) {
234 ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
236 " failed to allocate");
/* clear the newly added slots so blocks not yet written stay NULL */
239 memset(&new_blocks[ntdb->transaction->num_blocks], 0,
240 (1+(blk - ntdb->transaction->num_blocks))*sizeof(uint8_t *));
241 ntdb->transaction->blocks = new_blocks;
242 ntdb->transaction->num_blocks = blk+1;
243 ntdb->transaction->last_block_size = 0;
246 /* allocate and fill a block? */
247 if (ntdb->transaction->blocks[blk] == NULL) {
248 ntdb->transaction->blocks[blk] = (uint8_t *)calloc(PAGESIZE, 1);
249 if (ntdb->transaction->blocks[blk] == NULL) {
250 ecode = ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
252 " failed to allocate");
/* the block overlaps the pre-transaction file: load the existing
   content, clamped to old_map_size */
255 if (ntdb->transaction->old_map_size > blk * PAGESIZE) {
256 ntdb_len_t len2 = PAGESIZE;
257 if (len2 + (blk * PAGESIZE) > ntdb->transaction->old_map_size) {
258 len2 = ntdb->transaction->old_map_size - (blk * PAGESIZE);
260 ecode = ntdb->transaction->io_methods->tread(ntdb,
262 ntdb->transaction->blocks[blk],
264 if (ecode != NTDB_SUCCESS) {
265 ecode = ntdb_logerr(ntdb, ecode,
269 " read old block: %s",
/* drop the half-initialised block so it is not used later */
271 SAFE_FREE(ntdb->transaction->blocks[blk]);
274 if (blk == ntdb->transaction->num_blocks-1) {
275 ntdb->transaction->last_block_size = len2;
280 /* overwrite part of an existing block */
/* NOTE(review): the condition that selects memset versus memcpy is not
   visible here; transaction_expand_file() passes buf == NULL, which
   presumably takes the zero-fill path - confirm. */
282 memset(ntdb->transaction->blocks[blk] + off, 0, len);
284 memcpy(ntdb->transaction->blocks[blk] + off, buf, len);
/* grow the valid length of the final block if this write extended it */
286 if (blk == ntdb->transaction->num_blocks-1) {
287 if (len + off > ntdb->transaction->last_block_size) {
288 ntdb->transaction->last_block_size = len + off;
/* failure path: poison the transaction so a later commit is refused */
295 ntdb->transaction->transaction_error = 1;
301 write while in a transaction - this variant never expands the transaction blocks, it only
302 updates existing blocks. This means it cannot change the recovery size
/* Copy len bytes from buf into the transaction blocks at file offset
   off, but only where a block already exists: no block is allocated
   and last_block_size is never enlarged. In the final block the copy
   is clipped to last_block_size. Returns nothing.
   NOTE(review): several original lines (declarations, braces, early
   returns) are not visible in this excerpt. */
304 static void transaction_write_existing(struct ntdb_context *ntdb, ntdb_off_t off,
305 const void *buf, ntdb_len_t len)
309 /* break it up into block sized chunks */
310 while (len + (off % PAGESIZE) > PAGESIZE) {
/* len2 is the number of bytes left in the block that contains off */
311 ntdb_len_t len2 = PAGESIZE - (off % PAGESIZE);
312 transaction_write_existing(ntdb, off, buf, len2);
316 buf = (const void *)(len2 + (const char *)buf);
/* from here on off is the position inside block blk */
324 blk = off / PAGESIZE;
325 off = off % PAGESIZE;
/* test for a block that has not been created by transaction_write() */
327 if (ntdb->transaction->num_blocks <= blk ||
328 ntdb->transaction->blocks[blk] == NULL) {
/* the request runs past the valid bytes of the final block */
332 if (blk == ntdb->transaction->num_blocks-1 &&
333 off + len > ntdb->transaction->last_block_size) {
334 if (off >= ntdb->transaction->last_block_size) {
/* keep only the part that lies inside the valid bytes */
337 len = ntdb->transaction->last_block_size - off;
340 /* overwrite part of an existing block */
341 memcpy(ntdb->transaction->blocks[blk] + off, buf, len);
346 out of bounds check during a transaction
/* Bounds check used while a transaction is active: tests whether the
   range [off, off + len) lies inside ntdb->file->map_size. When probe
   is true the first test is satisfied regardless of the range. The
   remaining visible code logs an NTDB_ERR_IO error.
   NOTE(review): the return statements are not visible in this
   excerpt. */
348 static enum NTDB_ERROR transaction_oob(struct ntdb_context *ntdb,
349 ntdb_off_t off, ntdb_len_t len, bool probe)
/* off + len >= off rejects a sum that wrapped around (this assumes
   ntdb_off_t is an unsigned type - TODO confirm) */
351 if ((off + len >= off && off + len <= ntdb->file->map_size) || probe) {
355 ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
356 "ntdb_oob len %lld beyond transaction size %lld",
357 (long long)(off + len),
358 (long long)ntdb->file->map_size);
363 transaction version of ntdb_expand().
/* Expansion hook used while a transaction is active. It records a
   write of addition bytes with a NULL buffer at the current end of the
   map through transaction_write() and, when that succeeds, adds
   addition to ntdb->file->map_size. The file itself is not touched
   here.
   NOTE(review): the second line of the parameter list and the return
   statement are not visible in this excerpt. */
365 static enum NTDB_ERROR transaction_expand_file(struct ntdb_context *ntdb,
368 enum NTDB_ERROR ecode;
370 /* add a write to the transaction elements, so subsequent
371 reads see the zero data */
372 ecode = transaction_write(ntdb, ntdb->file->map_size, NULL, addition);
373 if (ecode == NTDB_SUCCESS) {
374 ntdb->file->map_size += addition;
/* Direct-access hook used while a transaction is active: returns a
   pointer through which len bytes at file offset off can be accessed.
   Two paths are visible, distinguished by the statistics counters they
   update (presumably selected by write_mode - the test itself is not
   visible here):
   - write path: returns a pointer into the transaction block, and
     counts a failure when blk is beyond num_blocks or the block has
     not been created;
   - read path: returns a pointer into the transaction block when it
     exists; otherwise scans blocks blk..end_blk, counts a failure when
     one of them exists, and finally delegates to the saved
     io_methods->direct().
   NOTE(review): several original lines (conditions, braces, returns)
   are not visible in this excerpt. */
379 static void *transaction_direct(struct ntdb_context *ntdb, ntdb_off_t off,
380 size_t len, bool write_mode)
382 size_t blk = off / PAGESIZE, end_blk;
384 /* This is wrong for zero-length blocks, but will fail gracefully */
385 end_blk = (off + len - 1) / PAGESIZE;
387 /* Can only do direct if in single block and we've already copied. */
389 ntdb->stats.transaction_write_direct++;
391 || blk >= ntdb->transaction->num_blocks
392 || ntdb->transaction->blocks[blk] == NULL) {
393 ntdb->stats.transaction_write_direct_fail++;
396 return ntdb->transaction->blocks[blk] + off % PAGESIZE;
399 ntdb->stats.transaction_read_direct++;
400 /* Single which we have copied? */
402 && blk < ntdb->transaction->num_blocks
403 && ntdb->transaction->blocks[blk])
404 return ntdb->transaction->blocks[blk] + off % PAGESIZE;
406 /* Otherwise must be all not copied. */
407 while (blk <= end_blk) {
408 if (blk >= ntdb->transaction->num_blocks)
410 if (ntdb->transaction->blocks[blk]) {
411 ntdb->stats.transaction_read_direct_fail++;
/* no block in the range is held by the transaction: hand the request
   to the underlying methods, always as a read */
416 return ntdb->transaction->io_methods->direct(ntdb, off, len, false);
/* Method table installed into ntdb->io by ntdb_transaction_start() for
   the lifetime of a transaction.
   NOTE(review): only the transaction_expand_file entry is visible in
   this excerpt; the other initialisers and the closing brace are
   missing. */
419 static const struct ntdb_methods transaction_methods = {
423 transaction_expand_file,
/* Flush changes to stable storage: fsync() on the file descriptor and,
   when the file is memory mapped, msync() with MS_SYNC on the mapped
   range that covers [offset, offset + length). A failure of either
   call is logged and reported as NTDB_ERR_IO. The NTDB_NOSYNC flag is
   tested first.
   NOTE(review): the body of the NTDB_NOSYNC branch, the trailing
   arguments of the log calls and the final return are not visible in
   this excerpt. */
430 static enum NTDB_ERROR transaction_sync(struct ntdb_context *ntdb,
431 ntdb_off_t offset, ntdb_len_t length)
433 if (ntdb->flags & NTDB_NOSYNC) {
437 if (fsync(ntdb->file->fd) != 0) {
438 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
439 "ntdb_transaction: fsync failed: %s",
443 if (ntdb->file->map_ptr) {
/* round the start down to a page boundary and lengthen the range by
   the same amount, so the msync() address is page aligned */
444 ntdb_off_t moffset = offset & ~(getpagesize()-1);
445 if (msync(moffset + (char *)ntdb->file->map_ptr,
446 length + (offset - moffset), MS_SYNC) != 0) {
447 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
448 "ntdb_transaction: msync failed: %s",
/* Tear down the current transaction. Visible steps, in order:
   - log a usage error when no transaction is open;
   - for a nested transaction, set transaction_error and decrement the
     nesting count;
   - restore ntdb->file->map_size from old_map_size;
   - free every transaction block and the block array;
   - when magic_offset is set, overwrite the recovery magic with
     NTDB_RECOVERY_INVALID_MAGIC through the original methods and sync;
   - release the all-record lock if held, restore the original io
     methods, release the transaction lock and, if held, the open lock;
   - free the transaction structure.
   NOTE(review): several original lines (declarations, braces, early
   returns) are not visible in this excerpt. */
457 static void _ntdb_transaction_cancel(struct ntdb_context *ntdb)
460 enum NTDB_ERROR ecode;
462 if (ntdb->transaction == NULL) {
463 ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
464 "ntdb_transaction_cancel: no transaction");
468 if (ntdb->transaction->nesting != 0) {
/* cancelling an inner transaction poisons the outer one */
469 ntdb->transaction->transaction_error = 1;
470 ntdb->transaction->nesting--;
474 ntdb->file->map_size = ntdb->transaction->old_map_size;
476 /* free all the transaction blocks */
477 for (i=0;i<ntdb->transaction->num_blocks;i++) {
478 if (ntdb->transaction->blocks[i] != NULL) {
479 free(ntdb->transaction->blocks[i]);
482 SAFE_FREE(ntdb->transaction->blocks);
/* a non-zero magic_offset means transaction_setup_recovery() has
   written a recovery marker that must now be invalidated */
484 if (ntdb->transaction->magic_offset) {
485 const struct ntdb_methods *methods = ntdb->transaction->io_methods;
486 uint64_t invalid = NTDB_RECOVERY_INVALID_MAGIC;
488 /* remove the recovery marker */
489 ecode = methods->twrite(ntdb, ntdb->transaction->magic_offset,
490 &invalid, sizeof(invalid));
491 if (ecode == NTDB_SUCCESS)
492 ecode = transaction_sync(ntdb,
493 ntdb->transaction->magic_offset,
/* a failure here is only logged; the teardown continues */
495 if (ecode != NTDB_SUCCESS) {
496 ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
497 "ntdb_transaction_cancel: failed to remove"
502 if (ntdb->file->allrecord_lock.count)
503 ntdb_allrecord_unlock(ntdb, ntdb->file->allrecord_lock.ltype);
505 /* restore the normal io methods */
506 ntdb->io = ntdb->transaction->io_methods;
508 ntdb_transaction_unlock(ntdb, F_WRLCK);
510 if (ntdb_has_open_lock(ntdb))
511 ntdb_unlock_open(ntdb, F_WRLCK);
513 SAFE_FREE(ntdb->transaction);
517 start a ntdb transaction. No token is returned, as only a single
518 transaction is allowed to be pending per ntdb_context
/* Begin a transaction on ntdb. Visible behaviour:
   - rejected with NTDB_ERR_EINVAL when NTDB_INTERNAL is set and with
     NTDB_ERR_RDONLY when NTDB_RDONLY is set;
   - when a transaction is already open, rejected with NTDB_ERR_IO
     unless NTDB_ALLOW_NESTING is set, in which case the nesting count
     is incremented;
   - rejected with NTDB_ERR_LOCK when the caller holds hash locks;
   - otherwise a zero-filled ntdb_transaction is allocated, the
     transaction write lock and an all-record read lock are taken,
     old_map_size is recorded and ntdb->io is switched to
     transaction_methods.
   The tail of the function releases the transaction lock and frees
   the structure; it is the target of the fail_allrecord_lock jump.
   NOTE(review): several original lines (braces, returns, labels, some
   call arguments) are not visible in this excerpt. */
520 _PUBLIC_ enum NTDB_ERROR ntdb_transaction_start(struct ntdb_context *ntdb)
522 enum NTDB_ERROR ecode;
524 ntdb->stats.transactions++;
525 /* some sanity checks */
526 if (ntdb->flags & NTDB_INTERNAL) {
527 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
528 "ntdb_transaction_start:"
529 " cannot start a transaction on an"
533 if (ntdb->flags & NTDB_RDONLY) {
534 return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
535 "ntdb_transaction_start:"
536 " cannot start a transaction on a"
540 /* cope with nested ntdb_transaction_start() calls */
541 if (ntdb->transaction != NULL) {
542 if (!(ntdb->flags & NTDB_ALLOW_NESTING)) {
543 return ntdb_logerr(ntdb, NTDB_ERR_IO,
545 "ntdb_transaction_start:"
546 " already inside transaction");
548 ntdb->transaction->nesting++;
549 ntdb->stats.transaction_nest++;
553 if (ntdb_has_hash_locks(ntdb)) {
554 /* the caller must not have any locks when starting a
555 transaction as otherwise we'll be screwed by lack
556 of nested locks in POSIX */
557 return ntdb_logerr(ntdb, NTDB_ERR_LOCK,
559 "ntdb_transaction_start:"
560 " cannot start a transaction with locks"
/* calloc leaves every member zero, including magic_offset and
   transaction_error */
564 ntdb->transaction = (struct ntdb_transaction *)
565 calloc(sizeof(struct ntdb_transaction), 1);
566 if (ntdb->transaction == NULL) {
567 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
568 "ntdb_transaction_start:"
572 /* get the transaction write lock. This is a blocking lock. As
573 discussed with Volker, there are a number of ways we could
574 make this async, which we will probably do in the future */
575 ecode = ntdb_transaction_lock(ntdb, F_WRLCK);
576 if (ecode != NTDB_SUCCESS) {
577 SAFE_FREE(ntdb->transaction->blocks);
578 SAFE_FREE(ntdb->transaction);
582 /* get a read lock over entire file. This is upgraded to a write
583 lock during the commit */
584 ecode = ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, true);
585 if (ecode != NTDB_SUCCESS) {
586 goto fail_allrecord_lock;
589 /* make sure we know about any file expansions already done by
591 ntdb->io->oob(ntdb, ntdb->file->map_size, 1, true);
592 ntdb->transaction->old_map_size = ntdb->file->map_size;
594 /* finally hook the io methods, replacing them with
595 transaction specific methods */
596 ntdb->transaction->io_methods = ntdb->io;
597 ntdb->io = &transaction_methods;
/* failure cleanup: undo the transaction lock and the allocation */
601 ntdb_transaction_unlock(ntdb, F_WRLCK);
602 SAFE_FREE(ntdb->transaction->blocks);
603 SAFE_FREE(ntdb->transaction);
609 cancel the current transaction
/* Public entry point for cancelling the current transaction: counts
   the call in ntdb->stats.transaction_cancel and delegates to
   _ntdb_transaction_cancel(). */
611 _PUBLIC_ void ntdb_transaction_cancel(struct ntdb_context *ntdb)
613 ntdb->stats.transaction_cancel++;
614 _ntdb_transaction_cancel(ntdb);
618 work out how much space the linearised recovery data will consume (worst case)
/* Return an upper bound, in bytes, on the linearised recovery data for
   the current transaction. The visible accumulation adds, per block,
   two ntdb_off_t values of header plus the block payload: PAGESIZE, or
   last_block_size for the final block. The loop also tests for blocks
   that start at or beyond old_map_size and for blocks that were never
   created.
   NOTE(review): the bodies of those two tests (presumably break and
   continue) and some braces are not visible in this excerpt. */
620 static ntdb_len_t ntdb_recovery_size(struct ntdb_context *ntdb)
622 ntdb_len_t recovery_size = 0;
626 for (i=0;i<ntdb->transaction->num_blocks;i++) {
627 if (i * PAGESIZE >= ntdb->transaction->old_map_size) {
630 if (ntdb->transaction->blocks[i] == NULL) {
/* header of one recovery entry */
633 recovery_size += 2*sizeof(ntdb_off_t);
634 if (i == ntdb->transaction->num_blocks-1) {
635 recovery_size += ntdb->transaction->last_block_size;
637 recovery_size += PAGESIZE;
641 return recovery_size;
/* Locate the existing recovery area. The recovery offset is read from
   the recovery field of the ntdb header into *recovery_offset; an
   error encoded in that offset is returned as an error code. When the
   offset is non-zero the record header is read into rec through
   methods->tread() and passed through ntdb_convert(). A header whose
   magic is neither NTDB_RECOVERY_MAGIC nor NTDB_RECOVERY_INVALID_MAGIC
   is ignored by resetting *recovery_offset to 0.
   NOTE(review): the body of the zero-offset branch and the return
   statements are not visible in this excerpt. */
644 static enum NTDB_ERROR ntdb_recovery_area(struct ntdb_context *ntdb,
645 const struct ntdb_methods *methods,
646 ntdb_off_t *recovery_offset,
647 struct ntdb_recovery_record *rec)
649 enum NTDB_ERROR ecode;
651 *recovery_offset = ntdb_read_off(ntdb,
652 offsetof(struct ntdb_header, recovery));
653 if (NTDB_OFF_IS_ERR(*recovery_offset)) {
654 return NTDB_OFF_TO_ERR(*recovery_offset);
657 if (*recovery_offset == 0) {
662 ecode = methods->tread(ntdb, *recovery_offset, rec, sizeof(*rec));
663 if (ecode != NTDB_SUCCESS)
666 ntdb_convert(ntdb, rec, sizeof(*rec));
667 /* ignore invalid recovery regions: can happen in crash */
668 if (rec->magic != NTDB_RECOVERY_MAGIC &&
669 rec->magic != NTDB_RECOVERY_INVALID_MAGIC) {
670 *recovery_offset = 0;
/* Compare the buffers new and old byte by byte over length bytes,
   testing for the first position at which they differ.
   NOTE(review): the third parameter, the loop exit and the return
   statement are not visible in this excerpt; the caller uses the
   result as the length of the identical prefix. */
676 static unsigned int same(const unsigned char *new,
677 const unsigned char *old,
682 for (i = 0; i < length; i++) {
683 if (new[i] != old[i])
/* Scan new and old over length bytes while tracking, in *samelen, a
   run of identical bytes, and compare that run with min_same. The
   final visible statements test *samelen against min_same and return
   length - *samelen.
   NOTE(review): the third parameter, the updates of *samelen and the
   early return inside the loop are not visible in this excerpt. The
   caller treats the result as the length of a differing run and
   *samelen as the number of identical bytes that follow it. */
689 static unsigned int different(const unsigned char *new,
690 const unsigned char *old,
692 unsigned int min_same,
693 unsigned int *samelen)
698 for (i = 0; i < length; i++) {
699 if (new[i] == old[i]) {
702 if (*samelen >= min_same) {
709 if (*samelen < min_same)
711 return length - *samelen;
/* Build the recovery blob for the current transaction in freshly
   allocated memory: a struct ntdb_recovery_record (left unset)
   followed by the entries. For every transaction block that overlaps
   the pre-transaction file, the current file content is fetched with
   ntdb_access_read() and compared with the block. Each differing run
   is stored as an offset and length header (passed through
   ntdb_convert()) followed by the bytes from the file, that is the old
   data. The payload size is stored in *len. The original io methods
   are installed for the duration of the scan and restored on both
   visible exits. Failures are returned through NTDB_ERR_PTR().
   NOTE(review): several original lines (declarations, braces, returns,
   labels, some call arguments) are not visible in this excerpt. */
714 /* Allocates recovery blob, without ntdb_recovery_record at head set up. */
715 static struct ntdb_recovery_record *alloc_recovery(struct ntdb_context *ntdb,
718 struct ntdb_recovery_record *rec;
720 enum NTDB_ERROR ecode;
722 const struct ntdb_methods *old_methods = ntdb->io;
/* sized for the worst case computed by ntdb_recovery_size() */
724 rec = malloc(sizeof(*rec) + ntdb_recovery_size(ntdb));
726 ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
727 "transaction_setup_recovery:"
729 return NTDB_ERR_PTR(NTDB_ERR_OOM);
732 /* We temporarily revert to the old I/O methods, so we can use
733 * ntdb_access_read */
734 ntdb->io = ntdb->transaction->io_methods;
736 /* build the recovery data into a single blob to allow us to do a single
737 large write, which should be more efficient */
/* p is the write cursor; entries start right after the record header */
738 p = (unsigned char *)(rec + 1);
739 for (i=0;i<ntdb->transaction->num_blocks;i++) {
743 const unsigned char *buffer;
745 if (ntdb->transaction->blocks[i] == NULL) {
749 offset = i * PAGESIZE;
751 if (i == ntdb->transaction->num_blocks-1) {
752 length = ntdb->transaction->last_block_size;
755 if (offset >= ntdb->transaction->old_map_size) {
759 if (offset + length > ntdb->file->map_size) {
760 ecode = ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
761 "ntdb_transaction_setup_recovery:"
762 " transaction data over new region"
766 if (offset + length > ntdb->transaction->old_map_size) {
767 /* Short read at EOF. */
768 length = ntdb->transaction->old_map_size - offset;
770 buffer = ntdb_access_read(ntdb, offset, length, false);
771 if (NTDB_PTR_IS_ERR(buffer)) {
772 ecode = NTDB_PTR_ERR(buffer);
776 /* Skip over anything the same at the start. */
777 off = same(ntdb->transaction->blocks[i], buffer, length);
780 while (off < length) {
782 unsigned int samelen;
/* min_same is one byte more than an entry header, presumably so that a
   short identical run does not start a new entry - TODO confirm */
784 len1 = different(ntdb->transaction->blocks[i] + off,
785 buffer + off, length - off,
786 sizeof(offset) + sizeof(len1) + 1,
789 memcpy(p, &offset, sizeof(offset));
790 memcpy(p + sizeof(offset), &len1, sizeof(len1));
791 ntdb_convert(ntdb, p, sizeof(offset) + sizeof(len1));
792 p += sizeof(offset) + sizeof(len1);
/* the payload is the content currently in the file, not the new data */
793 memcpy(p, buffer + off, len1);
/* step over the differing run and the identical run that follows */
795 off += len1 + samelen;
796 offset += len1 + samelen;
798 ntdb_access_release(ntdb, buffer);
801 *len = p - (unsigned char *)(rec + 1);
802 ntdb->io = old_methods;
/* failure exit: restore the transaction io methods as well */
807 ntdb->io = old_methods;
808 return NTDB_ERR_PTR(ecode);
/* Create a new recovery area at the end of the file, large enough for
   rec_length bytes of recovery data, and return its offset (errors are
   returned through NTDB_ERR_TO_OFF()). rec->max_len is set from
   ntdb_expand_adjust() and then rounded. The file is expanded through
   the original methods by the pending transaction growth plus the
   recovery area, old_map_size is moved to the new map_size, and the
   offset is written to the recovery field of the header, both on disk
   and in the transaction blocks.
   NOTE(review): several original lines (declarations, the end of the
   rounding expression, the assignment of recovery_off, braces, the
   final return) are not visible in this excerpt. */
811 static ntdb_off_t create_recovery_area(struct ntdb_context *ntdb,
812 ntdb_len_t rec_length,
813 struct ntdb_recovery_record *rec)
815 ntdb_off_t off, recovery_off;
817 enum NTDB_ERROR ecode;
818 const struct ntdb_methods *methods = ntdb->transaction->io_methods;
820 /* round up to a multiple of page size. Overallocate, since each
821 * such allocation forces us to expand the file. */
822 rec->max_len = ntdb_expand_adjust(ntdb->file->map_size, rec_length);
824 /* Round up to a page. */
825 rec->max_len = ((sizeof(*rec) + rec->max_len + PAGESIZE-1)
/* remember the current end of the map before it is changed below */
829 off = ntdb->file->map_size;
831 /* Restore ->map_size before calling underlying expand_file.
832 Also so that we don't try to expand the file again in the
833 transaction commit, which would destroy the recovery
835 addition = (ntdb->file->map_size - ntdb->transaction->old_map_size) +
836 sizeof(*rec) + rec->max_len;
837 ntdb->file->map_size = ntdb->transaction->old_map_size;
838 ntdb->stats.transaction_expand_file++;
839 ecode = methods->expand_file(ntdb, addition);
840 if (ecode != NTDB_SUCCESS) {
841 ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
842 "ntdb_recovery_allocate:"
843 " failed to create recovery area");
844 return NTDB_ERR_TO_OFF(ecode);
847 /* we have to reset the old map size so that we don't try to
848 expand the file again in the transaction commit, which
849 would destroy the recovery area */
850 ntdb->transaction->old_map_size = ntdb->file->map_size;
852 /* write the recovery header offset and sync - we can sync without a race here
853 as the magic ptr in the recovery record has not been set */
855 ntdb_convert(ntdb, &recovery_off, sizeof(recovery_off));
856 ecode = methods->twrite(ntdb, offsetof(struct ntdb_header, recovery),
857 &recovery_off, sizeof(ntdb_off_t));
858 if (ecode != NTDB_SUCCESS) {
859 ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
860 "ntdb_recovery_allocate:"
861 " failed to write recovery head");
862 return NTDB_ERR_TO_OFF(ecode);
/* mirror the header update into the transaction blocks, so the commit
   does not write a stale header over it */
864 transaction_write_existing(ntdb, offsetof(struct ntdb_header, recovery),
871 setup the recovery data that will be used on a crash during commit
/* Write the recovery data for the current transaction. Visible steps:
   1. build the blob with alloc_recovery();
   2. look up the existing recovery area with ntdb_recovery_area();
   3. when its max_len is too small, hand the old area to
      add_free_record(), rebuild the blob and create a new area with
      create_recovery_area();
   4. fill in the record header with NTDB_RECOVERY_INVALID_MAGIC, the
      payload length and the pre-transaction file size, then write
      header and payload through the original methods and sync;
   5. write NTDB_RECOVERY_MAGIC at magic_offset and sync again, which
      makes the record valid.
   Each on-disk write is mirrored into the transaction blocks with
   transaction_write_existing().
   NOTE(review): several original lines (declarations, conditions,
   braces, returns, some call arguments, presumably also free() calls)
   are not visible in this excerpt. */
873 static enum NTDB_ERROR transaction_setup_recovery(struct ntdb_context *ntdb)
875 ntdb_len_t recovery_size = 0;
876 ntdb_off_t recovery_off = 0;
/* sampled before create_recovery_area() can move old_map_size */
877 ntdb_off_t old_map_size = ntdb->transaction->old_map_size;
878 struct ntdb_recovery_record *recovery;
879 const struct ntdb_methods *methods = ntdb->transaction->io_methods;
881 enum NTDB_ERROR ecode;
883 recovery = alloc_recovery(ntdb, &recovery_size);
884 if (NTDB_PTR_IS_ERR(recovery))
885 return NTDB_PTR_ERR(recovery);
887 ecode = ntdb_recovery_area(ntdb, methods, &recovery_off, recovery);
893 if (recovery->max_len < recovery_size) {
894 /* Not large enough. Free up old recovery area. */
897 ecode = add_free_record(ntdb, recovery_off,
900 NTDB_LOCK_WAIT, true);
902 if (ecode != NTDB_SUCCESS) {
903 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
904 "ntdb_recovery_allocate:"
905 " failed to free previous"
909 /* Refresh recovery after add_free_record above. */
910 recovery = alloc_recovery(ntdb, &recovery_size);
911 if (NTDB_PTR_IS_ERR(recovery))
912 return NTDB_PTR_ERR(recovery);
915 recovery_off = create_recovery_area(ntdb, recovery_size,
917 if (NTDB_OFF_IS_ERR(recovery_off)) {
919 return NTDB_OFF_TO_ERR(recovery_off);
923 /* Now we know size, convert rec header. */
/* the record stays marked invalid until the payload has been synced */
924 recovery->magic = NTDB_RECOVERY_INVALID_MAGIC;
925 recovery->len = recovery_size;
926 recovery->eof = old_map_size;
927 ntdb_convert(ntdb, recovery, sizeof(*recovery));
929 /* write the recovery data to the recovery area */
930 ecode = methods->twrite(ntdb, recovery_off, recovery,
931 sizeof(*recovery) + recovery_size);
932 if (ecode != NTDB_SUCCESS) {
934 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
935 "ntdb_transaction_setup_recovery:"
936 " failed to write recovery data");
/* NOTE(review): the write above covers sizeof(*recovery) +
   recovery_size bytes, while the mirror and the sync below pass
   recovery_size only - confirm the shorter length is intended. */
938 transaction_write_existing(ntdb, recovery_off, recovery, recovery_size);
942 /* as we don't have ordered writes, we have to sync the recovery
943 data before we update the magic to indicate that the recovery
945 ecode = transaction_sync(ntdb, recovery_off, recovery_size);
946 if (ecode != NTDB_SUCCESS)
949 magic = NTDB_RECOVERY_MAGIC;
950 ntdb_convert(ntdb, &magic, sizeof(magic));
952 ntdb->transaction->magic_offset
953 = recovery_off + offsetof(struct ntdb_recovery_record, magic);
955 ecode = methods->twrite(ntdb, ntdb->transaction->magic_offset,
956 &magic, sizeof(magic));
957 if (ecode != NTDB_SUCCESS) {
958 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
959 "ntdb_transaction_setup_recovery:"
960 " failed to write recovery magic");
962 transaction_write_existing(ntdb, ntdb->transaction->magic_offset,
963 &magic, sizeof(magic));
965 /* ensure the recovery magic marker is on disk */
966 return transaction_sync(ntdb, ntdb->transaction->magic_offset,
/* First phase of a commit. Visible behaviour:
   - NTDB_ERR_EINVAL when no transaction is open;
   - when the transaction is already prepared or has an error pending,
     it is cancelled and NTDB_ERR_EINVAL is returned;
   - a transaction that never wrote anything (blocks == NULL) succeeds
     immediately;
   - otherwise the all-record lock is upgraded, the open lock is taken
     and, unless NTDB_NOSYNC is set, the recovery data is written by
     transaction_setup_recovery();
   - the transaction is marked prepared and the file is expanded to the
     transaction size when it has grown.
   The open lock is kept on success.
   NOTE(review): several original lines (the body of the nesting test,
   failure handling after the lock and expand calls, braces) are not
   visible in this excerpt. */
970 static enum NTDB_ERROR _ntdb_transaction_prepare_commit(struct ntdb_context *ntdb)
972 const struct ntdb_methods *methods;
973 enum NTDB_ERROR ecode;
975 if (ntdb->transaction == NULL) {
976 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
977 "ntdb_transaction_prepare_commit:"
981 if (ntdb->transaction->prepared) {
/* the cancel frees ntdb->transaction before the error is returned */
982 _ntdb_transaction_cancel(ntdb);
983 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
984 "ntdb_transaction_prepare_commit:"
985 " transaction already prepared");
988 if (ntdb->transaction->transaction_error) {
989 _ntdb_transaction_cancel(ntdb);
990 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_ERROR,
991 "ntdb_transaction_prepare_commit:"
992 " transaction error pending");
996 if (ntdb->transaction->nesting != 0) {
1000 /* check for a null transaction */
1001 if (ntdb->transaction->blocks == NULL) {
1002 return NTDB_SUCCESS;
1005 methods = ntdb->transaction->io_methods;
1007 /* upgrade the main transaction lock region to a write lock */
1008 ecode = ntdb_allrecord_upgrade(ntdb, NTDB_HASH_LOCK_START);
1009 if (ecode != NTDB_SUCCESS) {
1013 /* get the open lock - this prevents new users attaching to the database
1014 during the commit */
1015 ecode = ntdb_lock_open(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
1016 if (ecode != NTDB_SUCCESS) {
1020 /* Since we have whole db locked, we don't need the expansion lock. */
1021 if (!(ntdb->flags & NTDB_NOSYNC)) {
1022 /* Sets up ntdb->transaction->recovery and
1023 * ntdb->transaction->magic_offset. */
1024 ecode = transaction_setup_recovery(ntdb);
1025 if (ecode != NTDB_SUCCESS) {
/* from this point transaction_write() refuses further writes */
1030 ntdb->transaction->prepared = true;
1032 /* expand the file to the new size if needed */
1033 if (ntdb->file->map_size != ntdb->transaction->old_map_size) {
1036 add = ntdb->file->map_size - ntdb->transaction->old_map_size;
1037 /* Restore original map size for ntdb_expand_file */
1038 ntdb->file->map_size = ntdb->transaction->old_map_size;
1039 ecode = methods->expand_file(ntdb, add);
1040 if (ecode != NTDB_SUCCESS) {
1045 /* Keep the open lock until the actual commit */
1046 return NTDB_SUCCESS;
1050 prepare to commit the current transaction
/* Public entry point for the first commit phase; forwards to
   _ntdb_transaction_prepare_commit() and returns its result. */
1052 _PUBLIC_ enum NTDB_ERROR ntdb_transaction_prepare_commit(struct ntdb_context *ntdb)
1054 return _ntdb_transaction_prepare_commit(ntdb);
1058 commit the current transaction
/* Commit the current transaction. Visible behaviour:
   - NTDB_ERR_EINVAL when no transaction is open;
   - for a nested transaction only the nesting count is decremented;
   - a transaction without blocks is simply cancelled and succeeds;
   - when not yet prepared, _ntdb_transaction_prepare_commit() runs
     first and a failure cancels the transaction;
   - every transaction block is written to the file through the
     original methods and freed; a failed write triggers
     ntdb_transaction_recover() followed by a cancel;
   - the file is synced, utime() is called on the file name, and the
     transaction is torn down through _ntdb_transaction_cancel() after
     old_map_size has been set to the new map_size.
   NOTE(review): several original lines (declarations, braces, returns,
   comment delimiters around the TODO text) are not visible in this
   excerpt. */
1060 _PUBLIC_ enum NTDB_ERROR ntdb_transaction_commit(struct ntdb_context *ntdb)
1062 const struct ntdb_methods *methods;
1064 enum NTDB_ERROR ecode;
1066 if (ntdb->transaction == NULL) {
1067 return ntdb_logerr(ntdb, NTDB_ERR_EINVAL, NTDB_LOG_USE_ERROR,
1068 "ntdb_transaction_commit:"
1072 ntdb_trace(ntdb, "ntdb_transaction_commit");
1074 if (ntdb->transaction->nesting != 0) {
1075 ntdb->transaction->nesting--;
1076 return NTDB_SUCCESS;
1079 /* check for a null transaction */
1080 if (ntdb->transaction->blocks == NULL) {
1081 _ntdb_transaction_cancel(ntdb);
1082 return NTDB_SUCCESS;
1085 if (!ntdb->transaction->prepared) {
1086 ecode = _ntdb_transaction_prepare_commit(ntdb);
1087 if (ecode != NTDB_SUCCESS) {
/* NOTE(review): _ntdb_transaction_prepare_commit() already cancels the
   transaction on its error-pending path, so this second cancel can
   find ntdb->transaction == NULL and only log a usage error -
   confirm. */
1088 _ntdb_transaction_cancel(ntdb);
1093 methods = ntdb->transaction->io_methods;
1095 /* perform all the writes */
1096 for (i=0;i<ntdb->transaction->num_blocks;i++) {
1100 if (ntdb->transaction->blocks[i] == NULL) {
1104 offset = i * PAGESIZE;
/* the final block may hold fewer valid bytes than a full block */
1106 if (i == ntdb->transaction->num_blocks-1) {
1107 length = ntdb->transaction->last_block_size;
1110 ecode = methods->twrite(ntdb, offset,
1111 ntdb->transaction->blocks[i], length);
1112 if (ecode != NTDB_SUCCESS) {
1113 /* we've overwritten part of the data and
1114 possibly expanded the file, so we need to
1115 run the crash recovery code */
/* NOTE(review): the result of ntdb_transaction_recover() is not used
   on the visible line - confirm that ignoring it is intended. */
1117 ntdb_transaction_recover(ntdb);
1119 _ntdb_transaction_cancel(ntdb);
1123 SAFE_FREE(ntdb->transaction->blocks[i]);
/* with the array gone and num_blocks zero, the later cancel has no
   blocks left to free */
1126 SAFE_FREE(ntdb->transaction->blocks);
1127 ntdb->transaction->num_blocks = 0;
1129 /* ensure the new data is on disk */
1130 ecode = transaction_sync(ntdb, 0, ntdb->file->map_size);
1131 if (ecode != NTDB_SUCCESS) {
1136 TODO: maybe write to some dummy hdr field, or write to magic
1137 offset without mmap, before the last sync, instead of the
1141 /* on some systems (like Linux 2.6.x) changes via mmap/msync
1142 don't change the mtime of the file, this means the file may
1143 not be backed up (as ntdb rounding to block sizes means that
1144 file size changes are quite rare too). The following forces
1145 mtime changes when a transaction completes */
/* the return value of utime() is not checked */
1147 utime(ntdb->name, NULL);
1150 /* use a transaction cancel to free memory and remove the
1151 transaction locks: it "restores" map_size, too. */
1152 ntdb->transaction->old_map_size = ntdb->file->map_size;
1153 _ntdb_transaction_cancel(ntdb);
1155 return NTDB_SUCCESS;
1160 recover from an aborted transaction. Must be called with exclusive
1161 database write access already established (including the open
1162 lock to prevent new processes attaching)
/* Replay the recovery record, if a valid one exists. Visible
   behaviour:
   - succeeds immediately when the header holds no recovery offset or
     the record magic is not NTDB_RECOVERY_MAGIC;
   - NTDB_ERR_CORRUPT when recovery is needed on a read-only database;
   - reads rec.len bytes of recovery data into a heap buffer and writes
     each entry (offset, length, payload) back to the file through
     ntdb->io->twrite();
   - syncs, clears the recovery offset in the header when the recovery
     area lies at or beyond the recovered end of file, overwrites the
     record magic with NTDB_RECOVERY_INVALID_MAGIC, syncs again and
     logs a warning with the recovered size.
   NOTE(review): several original lines (declarations, conditions,
   braces, pointer updates, free() calls, some call arguments) are not
   visible in this excerpt. */
1164 enum NTDB_ERROR ntdb_transaction_recover(struct ntdb_context *ntdb)
1166 ntdb_off_t recovery_head, recovery_eof;
1167 unsigned char *data, *p;
1168 struct ntdb_recovery_record rec;
1169 enum NTDB_ERROR ecode;
1171 /* find the recovery area */
1172 recovery_head = ntdb_read_off(ntdb, offsetof(struct ntdb_header,recovery));
1173 if (NTDB_OFF_IS_ERR(recovery_head)) {
1174 ecode = NTDB_OFF_TO_ERR(recovery_head);
1175 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
1176 "ntdb_transaction_recover:"
1177 " failed to read recovery head");
1180 if (recovery_head == 0) {
1181 /* we have never allocated a recovery record */
1182 return NTDB_SUCCESS;
1185 /* read the recovery record */
1186 ecode = ntdb_read_convert(ntdb, recovery_head, &rec, sizeof(rec));
1187 if (ecode != NTDB_SUCCESS) {
1188 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
1189 "ntdb_transaction_recover:"
1190 " failed to read recovery record");
1193 if (rec.magic != NTDB_RECOVERY_MAGIC) {
1194 /* there is no valid recovery data */
1195 return NTDB_SUCCESS;
1198 if (ntdb->flags & NTDB_RDONLY) {
1199 return ntdb_logerr(ntdb, NTDB_ERR_CORRUPT, NTDB_LOG_ERROR,
1200 "ntdb_transaction_recover:"
1201 " attempt to recover read only database");
/* file size to report once the old data has been written back */
1204 recovery_eof = rec.eof;
1206 data = (unsigned char *)malloc(rec.len);
1208 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
1209 "ntdb_transaction_recover:"
1210 " failed to allocate recovery data");
1213 /* read the full recovery data */
1214 ecode = ntdb->io->tread(ntdb, recovery_head + sizeof(rec), data,
/* NOTE(review): no free(data) is visible on this error path, so the
   buffer allocated above appears to leak when the read fails -
   confirm. */
1216 if (ecode != NTDB_SUCCESS) {
1217 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
1218 "ntdb_transaction_recover:"
1219 " failed to read recovery data");
1222 /* recover the file data */
/* each entry is an offset and a length followed by the payload */
1224 while (p+sizeof(ntdb_off_t)+sizeof(ntdb_len_t) < data + rec.len) {
1227 ntdb_convert(ntdb, p, sizeof(ofs) + sizeof(len));
1228 memcpy(&ofs, p, sizeof(ofs));
1229 memcpy(&len, p + sizeof(ofs), sizeof(len));
1230 p += sizeof(ofs) + sizeof(len);
/* NOTE(review): no check that p + len stays within data + rec.len is
   visible before this write - confirm against the full source. */
1232 ecode = ntdb->io->twrite(ntdb, ofs, p, len);
1233 if (ecode != NTDB_SUCCESS) {
1235 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
1236 "ntdb_transaction_recover:"
1237 " failed to recover %zu bytes"
1239 (size_t)len, (size_t)ofs);
1246 ecode = transaction_sync(ntdb, 0, ntdb->file->map_size);
1247 if (ecode != NTDB_SUCCESS) {
1248 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
1249 "ntdb_transaction_recover:"
1250 " failed to sync recovery");
1253 /* if the recovery area is after the recovered eof then remove it */
1254 if (recovery_eof <= recovery_head) {
1255 ecode = ntdb_write_off(ntdb, offsetof(struct ntdb_header,
1258 if (ecode != NTDB_SUCCESS) {
1259 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
1260 "ntdb_transaction_recover:"
1261 " failed to remove recovery head");
1265 /* remove the recovery magic */
1266 ecode = ntdb_write_off(ntdb,
1268 + offsetof(struct ntdb_recovery_record, magic),
1269 NTDB_RECOVERY_INVALID_MAGIC);
1270 if (ecode != NTDB_SUCCESS) {
1271 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
1272 "ntdb_transaction_recover:"
1273 " failed to remove recovery magic");
1276 ecode = transaction_sync(ntdb, 0, recovery_eof);
1277 if (ecode != NTDB_SUCCESS) {
1278 return ntdb_logerr(ntdb, ecode, NTDB_LOG_ERROR,
1279 "ntdb_transaction_recover:"
1280 " failed to sync2 recovery");
/* logged with NTDB_SUCCESS at warning level: the recovery itself is
   reported, it is not an error */
1283 ntdb_logerr(ntdb, NTDB_SUCCESS, NTDB_LOG_WARNING,
1284 "ntdb_transaction_recover: recovered %zu byte database",
1285 (size_t)recovery_eof);
1288 return NTDB_SUCCESS;
/* Report whether the database holds a valid recovery record. The
   recovery offset is read from the header; when it is non-zero the
   record is read with ntdb_read_convert() and the result is whether
   its magic equals NTDB_RECOVERY_MAGIC. Read failures are returned as
   encoded errors.
   NOTE(review): the return statement of the zero-offset branch and
   the braces are not visible in this excerpt. */
1291 ntdb_bool_err ntdb_needs_recovery(struct ntdb_context *ntdb)
1293 ntdb_off_t recovery_head;
1294 struct ntdb_recovery_record rec;
1295 enum NTDB_ERROR ecode;
1297 /* find the recovery area */
1298 recovery_head = ntdb_read_off(ntdb, offsetof(struct ntdb_header,recovery));
1299 if (NTDB_OFF_IS_ERR(recovery_head)) {
/* the error is already encoded in the offset value */
1300 return recovery_head;
1303 if (recovery_head == 0) {
1304 /* we have never allocated a recovery record */
1308 /* read the recovery record */
1309 ecode = ntdb_read_convert(ntdb, recovery_head, &rec, sizeof(rec));
1310 if (ecode != NTDB_SUCCESS) {
1311 return NTDB_ERR_TO_OFF(ecode);
1314 return (rec.magic == NTDB_RECOVERY_MAGIC);