From 3034a5a62b8eaac7bdc52bfb7af1beb29e888b34 Mon Sep 17 00:00:00 2001 From: Volker Lendecke Date: Tue, 18 Mar 2014 08:04:42 +0100 Subject: [PATCH] tdb: Reduce freelist contention In a metadata-intensive benchmark we have seen the locking.tdb freelist to be one of the central contention points. This patch removes most of the contention on the freelist. Ages ago we already reduced freelist contention by using the even much older DEAD records: If TDB_VOLATILE is set, don't directly put deleted records on the freelist, but just mark a few of them just as DEAD. The next new record can them re-use that space without consulting the freelist. This patch builds upon the DEAD records: If we need space and the freelist is busy, instead of doing a blocking wait on the freelist, start looking into other chains for DEAD records and steal them from there. This way every hash chain becomes a small freelist. Just wander around the hash chains as long as the freelist is still busy. With this patch and the tdb mutex patch (following hopefully some time soon) you can see a heavily busy clustered smbd run without locking.tdb futex syscalls. Signed-off-by: Volker Lendecke Reviewed-by: Michael Adam Reviewed-by: Stefan Metzmacher --- lib/tdb/common/freelist.c | 100 +++++++++++++++++++++++++++++++---- lib/tdb/common/tdb.c | 20 +------ lib/tdb/common/tdb_private.h | 3 +- 3 files changed, 93 insertions(+), 30 deletions(-) diff --git a/lib/tdb/common/freelist.c b/lib/tdb/common/freelist.c index ea14dd02ab..6f8f812d41 100644 --- a/lib/tdb/common/freelist.c +++ b/lib/tdb/common/freelist.c @@ -273,7 +273,8 @@ static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb, 0 is returned if the space could not be allocated */ -tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec) +static tdb_off_t tdb_allocate_from_freelist( + struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec) { tdb_off_t rec_ptr, last_ptr, newrec_ptr; struct { @@ -282,9 +283,6 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec } bestfit; float multiplier = 1.0; - if (tdb_lock(tdb, -1, F_WRLCK) == -1) - return 0; - /* over-allocate to reduce fragmentation */ length *= 1.25; @@ -297,7 +295,7 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec /* read in the freelist top */ if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1) - goto fail; + return 0; bestfit.rec_ptr = 0; bestfit.last_ptr = 0; @@ -310,7 +308,7 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec */ while (rec_ptr) { if (tdb_rec_free_read(tdb, rec_ptr, rec) == -1) { - goto fail; + return 0; } if (rec->rec_len >= length) { @@ -344,12 +342,11 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec if (bestfit.rec_ptr != 0) { if (tdb_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) { - goto fail; + return 0; } newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr); - tdb_unlock(tdb, -1, F_WRLCK); return newrec_ptr; } @@ -357,12 +354,95 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_rec database and if we can then try again */ if (tdb_expand(tdb, length + sizeof(*rec)) == 0) goto again; - fail: - tdb_unlock(tdb, -1, F_WRLCK); + return 0; } +static bool tdb_alloc_dead( + struct tdb_context *tdb, int hash, tdb_len_t length, + tdb_off_t *rec_ptr, struct tdb_record *rec) +{ + tdb_off_t last_ptr; + + *rec_ptr = tdb_find_dead(tdb, hash, rec, length, &last_ptr); + if (*rec_ptr == 0) { + return false; + } + /* + * Unlink the record from the hash chain, it's about to be moved into + * another one. + */ + return (tdb_ofs_write(tdb, last_ptr, &rec->next) == 0); +} + +/* + * Chain "hash" is assumed to be locked + */ + +tdb_off_t tdb_allocate(struct tdb_context *tdb, int hash, tdb_len_t length, + struct tdb_record *rec) +{ + tdb_off_t ret; + int i; + + if (tdb->max_dead_records == 0) { + /* + * No dead records to expect anywhere. Do the blocking + * freelist lock without trying to steal from others + */ + goto blocking_freelist_allocate; + } + + /* + * The following loop tries to get the freelist lock nonblocking. If + * it gets the lock, allocate from there. If the freelist is busy, + * instead of waiting we try to steal dead records from other hash + * chains. + * + * Be aware that we do nonblocking locks on the other hash chains as + * well and fail gracefully. This way we avoid deadlocks (we block two + * hash chains, something which is pretty bad normally) + */ + + for (i=1; ihash_size; i++) { + + int list; + + if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == 0) { + /* + * Under the freelist lock take the chance to give + * back our dead records. + */ + tdb_purge_dead(tdb, hash); + + ret = tdb_allocate_from_freelist(tdb, length, rec); + tdb_unlock(tdb, -1, F_WRLCK); + return ret; + } + + list = BUCKET(hash+i); + + if (tdb_lock_nonblock(tdb, list, F_WRLCK) == 0) { + bool got_dead; + got_dead = tdb_alloc_dead(tdb, list, length, &ret, rec); + tdb_unlock(tdb, list, F_WRLCK); + + if (got_dead) { + return ret; + } + } + } + +blocking_freelist_allocate: + + if (tdb_lock(tdb, -1, F_WRLCK) == -1) { + return 0; + } + ret = tdb_allocate_from_freelist(tdb, length, rec); + tdb_unlock(tdb, -1, F_WRLCK); + return ret; +} /* return the size of the freelist - used to decide if we should repack diff --git a/lib/tdb/common/tdb.c b/lib/tdb/common/tdb.c index a7111dea96..ba1c98edbe 100644 --- a/lib/tdb/common/tdb.c +++ b/lib/tdb/common/tdb.c @@ -550,26 +550,8 @@ static int _tdb_store(struct tdb_context *tdb, TDB_DATA key, } } - /* - * We have to allocate some space from the freelist, so this means we - * have to lock it. Use the chance to purge all the DEAD records from - * the hash chain under the freelist lock. - */ - - if (tdb_lock(tdb, -1, F_WRLCK) == -1) { - goto fail; - } - - if ((tdb->max_dead_records != 0) - && (tdb_purge_dead(tdb, hash) == -1)) { - tdb_unlock(tdb, -1, F_WRLCK); - goto fail; - } - /* we have to allocate some space */ - rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec); - - tdb_unlock(tdb, -1, F_WRLCK); + rec_ptr = tdb_allocate(tdb, hash, key.dsize + dbuf.dsize, &rec); if (rec_ptr == 0) { goto fail; diff --git a/lib/tdb/common/tdb_private.h b/lib/tdb/common/tdb_private.h index f62c0a3a6f..a672159578 100644 --- a/lib/tdb/common/tdb_private.h +++ b/lib/tdb/common/tdb_private.h @@ -255,7 +255,8 @@ int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); void *tdb_convert(void *buf, uint32_t size); int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec); -tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec); +tdb_off_t tdb_allocate(struct tdb_context *tdb, int hash, tdb_len_t length, + struct tdb_record *rec); int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d); int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off); -- 2.34.1