From: Tim Prouty Date: Tue, 3 Feb 2009 19:56:35 +0000 (-0800) Subject: s3 oplocks: Make the level2 oplock contention API more granular X-Git-Tag: samba-4.0.0alpha7~388 X-Git-Url: http://git.samba.org/?a=commitdiff_plain;h=c6f1f055fdeee29ad7c5b2ae9909e8f1b1a7daec;p=samba.git s3 oplocks: Make the level2 oplock contention API more granular This replaces release_level2_oplocks_on_change with contend_level2_oplock_begin/end in order to contend level2 oplocks throughout an operation rather than just at the begining. This is necessary for some kernel oplock implementations, and also lays the groundwork for better correctness in Samba's standard level2 oplock handling. The next step for non-kernel oplocks is to add additional state to the share mode lock struct that prevents any new opens from granting oplocks while a contending operation is in progress. All operations that contend level 2 oplocks are now correctly spanned except for aio and synchronous writes. The two write paths both have non-trivial error paths that need extra care to get right. RAW-OPLOCK and the rest of 'make test' are still passing with this change. --- diff --git a/source3/include/proto.h b/source3/include/proto.h index 1a1f8bef69e..ceea97bf56c 100644 --- a/source3/include/proto.h +++ b/source3/include/proto.h @@ -6899,7 +6899,15 @@ void release_file_oplock(files_struct *fsp); bool remove_oplock(files_struct *fsp); bool downgrade_oplock(files_struct *fsp); void reply_to_oplock_break_requests(files_struct *fsp); -void release_level_2_oplocks_on_change(files_struct *fsp); +void process_oplock_async_level2_break_message(struct messaging_context *msg_ctx, + void *private_data, + uint32_t msg_type, + struct server_id src, + DATA_BLOB *data); +void contend_level2_oplocks_begin(files_struct *fsp, + enum level2_contention_type type); +void contend_level2_oplocks_end(files_struct *fsp, + enum level2_contention_type type); void share_mode_entry_to_message(char *msg, const struct share_mode_entry *e); void message_to_share_mode_entry(struct share_mode_entry *e, char *msg); bool init_oplocks(struct messaging_context *msg_ctx); diff --git a/source3/include/smb.h b/source3/include/smb.h index 96cd3b7d853..557489f211e 100644 --- a/source3/include/smb.h +++ b/source3/include/smb.h @@ -1695,6 +1695,16 @@ struct kernel_oplocks { void *private_data; }; +enum level2_contention_type { + LEVEL2_CONTEND_ALLOC_SHRINK, + LEVEL2_CONTEND_ALLOC_GROW, + LEVEL2_CONTEND_SET_FILE_LEN, + LEVEL2_CONTEND_FILL_SPARSE, + LEVEL2_CONTEND_WRITE, + LEVEL2_CONTEND_WINDOWS_BRL, + LEVEL2_CONTEND_POSIX_BRL +}; + /* if a kernel does support oplocks then a structure of the following typee is used to describe how to interact with the kernel */ struct kernel_oplocks_ops { @@ -1702,6 +1712,10 @@ struct kernel_oplocks_ops { files_struct *fsp, int oplock_type); void (*release_oplock)(struct kernel_oplocks *ctx, files_struct *fsp, int oplock_type); + void (*contend_level2_oplocks_begin)(files_struct *fsp, + enum level2_contention_type type); + void (*contend_level2_oplocks_end)(files_struct *fsp, + enum level2_contention_type type); }; #include "smb_macros.h" diff --git a/source3/lib/dummysmbd.c b/source3/lib/dummysmbd.c index 5c624bdebfb..a41e6dc033a 100644 --- a/source3/lib/dummysmbd.c +++ b/source3/lib/dummysmbd.c @@ -66,3 +66,22 @@ struct messaging_context *smbd_messaging_context(void) { return NULL; } + +/** + * The following two functions need to be called from inside the low-level BRL + * code for oplocks correctness in smbd. Since other utility binaries also + * link in some of the brl code directly, these dummy functions are necessary + * to avoid needing to link in the oplocks code and its dependencies to all of + * the utility binaries. + */ +void contend_level2_oplocks_begin(files_struct *fsp, + enum level2_contention_type type) +{ + return; +} + +void contend_level2_oplocks_end(files_struct *fsp, + enum level2_contention_type type) +{ + return; +} diff --git a/source3/locking/brlock.c b/source3/locking/brlock.c index 032aaa56b61..76496d11fb3 100644 --- a/source3/locking/brlock.c +++ b/source3/locking/brlock.c @@ -311,6 +311,7 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, unsigned int i; files_struct *fsp = br_lck->fsp; struct lock_struct *locks = br_lck->lock_data; + NTSTATUS status; for (i=0; i < br_lck->num_locks; i++) { /* Do any Windows or POSIX locks conflict ? */ @@ -327,6 +328,10 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, #endif } + if (!IS_PENDING_LOCK(plock->lock_type)) { + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_WINDOWS_BRL); + } + /* We can get the Windows lock, now see if it needs to be mapped into a lower level POSIX one, and if so can we get it ? */ @@ -346,9 +351,11 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, plock->context.smbpid = 0xFFFFFFFF; if (errno_ret == EACCES || errno_ret == EAGAIN) { - return NT_STATUS_FILE_LOCK_CONFLICT; + status = NT_STATUS_FILE_LOCK_CONFLICT; + goto fail; } else { - return map_nt_error_from_unix(errno); + status = map_nt_error_from_unix(errno); + goto fail; } } } @@ -356,7 +363,8 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, /* no conflicts - add it to the list of locks */ locks = (struct lock_struct *)SMB_REALLOC(locks, (br_lck->num_locks + 1) * sizeof(*locks)); if (!locks) { - return NT_STATUS_NO_MEMORY; + status = NT_STATUS_NO_MEMORY; + goto fail; } memcpy(&locks[br_lck->num_locks], plock, sizeof(struct lock_struct)); @@ -365,6 +373,11 @@ static NTSTATUS brl_lock_windows(struct byte_range_lock *br_lck, br_lck->modified = True; return NT_STATUS_OK; + fail: + if (!IS_PENDING_LOCK(plock->lock_type)) { + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_WINDOWS_BRL); + } + return status; } /**************************************************************************** @@ -587,11 +600,13 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, struct byte_range_lock *br_lck, struct lock_struct *plock) { - unsigned int i, count; + unsigned int i, count, posix_count; struct lock_struct *locks = br_lck->lock_data; struct lock_struct *tp; bool lock_was_added = False; bool signal_pending_read = False; + bool break_oplocks = false; + NTSTATUS status; /* No zero-zero locks for POSIX. */ if (plock->start == 0 && plock->size == 0) { @@ -613,7 +628,7 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, return NT_STATUS_NO_MEMORY; } - count = 0; + count = posix_count = 0; for (i=0; i < br_lck->num_locks; i++) { struct lock_struct *curr_lock = &locks[i]; @@ -637,6 +652,8 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, memcpy(&tp[count], curr_lock, sizeof(struct lock_struct)); count++; } else { + unsigned int tmp_count; + /* POSIX conflict semantics are different. */ if (brl_conflict_posix(curr_lock, plock)) { /* Can't block ourselves with POSIX locks. */ @@ -648,10 +665,27 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, } /* Work out overlaps. */ - count += brlock_posix_split_merge(&tp[count], curr_lock, plock, &lock_was_added); + tmp_count += brlock_posix_split_merge(&tp[count], curr_lock, plock, &lock_was_added); + posix_count += tmp_count; + count += tmp_count; } } + /* + * Break oplocks while we hold a brl. Since lock() and unlock() calls + * are not symetric with POSIX semantics, we cannot guarantee our + * contend_level2_oplocks_begin/end calls will be acquired and + * released one-for-one as with Windows semantics. Therefore we only + * call contend_level2_oplocks_begin if this is the first POSIX brl on + * the file. + */ + break_oplocks = (!IS_PENDING_LOCK(plock->lock_type) && + posix_count == 0); + if (break_oplocks) { + contend_level2_oplocks_begin(br_lck->fsp, + LEVEL2_CONTEND_POSIX_BRL); + } + if (!lock_was_added) { memcpy(&tp[count], plock, sizeof(struct lock_struct)); count++; @@ -679,10 +713,12 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, if (errno_ret == EACCES || errno_ret == EAGAIN) { SAFE_FREE(tp); - return NT_STATUS_FILE_LOCK_CONFLICT; + status = NT_STATUS_FILE_LOCK_CONFLICT; + goto fail; } else { SAFE_FREE(tp); - return map_nt_error_from_unix(errno); + status = map_nt_error_from_unix(errno); + goto fail; } } } @@ -690,7 +726,8 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, /* Realloc so we don't leak entries per lock call. */ tp = (struct lock_struct *)SMB_REALLOC(tp, count * sizeof(*locks)); if (!tp) { - return NT_STATUS_NO_MEMORY; + status = NT_STATUS_NO_MEMORY; + goto fail; } br_lck->num_locks = count; SAFE_FREE(br_lck->lock_data); @@ -723,6 +760,12 @@ static NTSTATUS brl_lock_posix(struct messaging_context *msg_ctx, } return NT_STATUS_OK; + fail: + if (break_oplocks) { + contend_level2_oplocks_end(br_lck->fsp, + LEVEL2_CONTEND_POSIX_BRL); + } + return status; } /**************************************************************************** @@ -881,6 +924,7 @@ static bool brl_unlock_windows(struct messaging_context *msg_ctx, } } + contend_level2_oplocks_end(br_lck->fsp, LEVEL2_CONTEND_WINDOWS_BRL); return True; } @@ -892,7 +936,7 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, struct byte_range_lock *br_lck, const struct lock_struct *plock) { - unsigned int i, j, count; + unsigned int i, j, count, posix_count; struct lock_struct *tp; struct lock_struct *locks = br_lck->lock_data; bool overlap_found = False; @@ -919,7 +963,7 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, return False; } - count = 0; + count = posix_count = 0; for (i = 0; i < br_lck->num_locks; i++) { struct lock_struct *lock = &locks[i]; struct lock_struct tmp_lock[3]; @@ -946,6 +990,7 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, /* No change in this lock. */ memcpy(&tp[count], &tmp_lock[0], sizeof(struct lock_struct)); count++; + posix_count++; } else { SMB_ASSERT(tmp_lock[0].lock_type == UNLOCK_LOCK); overlap_found = True; @@ -970,6 +1015,7 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, } } count++; + posix_count++; continue; } else { /* tmp_count == 3 - (we split a lock range in two). */ @@ -979,8 +1025,10 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, memcpy(&tp[count], &tmp_lock[0], sizeof(struct lock_struct)); count++; + posix_count++; memcpy(&tp[count], &tmp_lock[2], sizeof(struct lock_struct)); count++; + posix_count++; overlap_found = True; /* Optimisation... */ /* We know we're finished here as we can't overlap any @@ -1024,6 +1072,11 @@ static bool brl_unlock_posix(struct messaging_context *msg_ctx, tp = NULL; } + if (posix_count == 0) { + contend_level2_oplocks_end(br_lck->fsp, + LEVEL2_CONTEND_POSIX_BRL); + } + br_lck->num_locks = count; SAFE_FREE(br_lck->lock_data); locks = tp; @@ -1276,6 +1329,7 @@ void brl_close_fnum(struct messaging_context *msg_ctx, struct lock_struct *locks = br_lck->lock_data; struct server_id pid = procid_self(); bool unlock_individually = False; + bool posix_level2_contention_ended = false; if(lp_posix_locking(fsp->conn->params)) { @@ -1346,8 +1400,17 @@ void brl_close_fnum(struct messaging_context *msg_ctx, if ((lock->lock_flav == WINDOWS_LOCK) && (lock->fnum == fnum)) { del_this_lock = True; num_deleted_windows_locks++; + contend_level2_oplocks_end(br_lck->fsp, + LEVEL2_CONTEND_WINDOWS_BRL); } else if (lock->lock_flav == POSIX_LOCK) { del_this_lock = True; + + /* Only end level2 contention once for posix */ + if (!posix_level2_contention_ended) { + posix_level2_contention_ended = true; + contend_level2_oplocks_end(br_lck->fsp, + LEVEL2_CONTEND_POSIX_BRL); + } } } diff --git a/source3/smbd/aio.c b/source3/smbd/aio.c index 75ce07d41a9..6b19e098e5c 100644 --- a/source3/smbd/aio.c +++ b/source3/smbd/aio.c @@ -291,7 +291,9 @@ bool schedule_aio_write_and_X(connection_struct *conn, aio_ex->req = talloc_move(aio_ex, &req); - release_level_2_oplocks_on_change(fsp); + /* This should actually be improved to span the write. */ + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_WRITE); + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_WRITE); if (!write_through && !lp_syncalways(SNUM(fsp->conn)) && fsp->aio_write_behind) { diff --git a/source3/smbd/fileio.c b/source3/smbd/fileio.c index 3e3f0943b3d..a9a97a2d14a 100644 --- a/source3/smbd/fileio.c +++ b/source3/smbd/fileio.c @@ -318,7 +318,9 @@ ssize_t write_file(struct smb_request *req, * the shared memory area whilst doing this. */ - release_level_2_oplocks_on_change(fsp); + /* This should actually be improved to span the write. */ + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_WRITE); + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_WRITE); #ifdef WITH_PROFILE if (profile_p && profile_p->writecache_total_writes % 500 == 0) { diff --git a/source3/smbd/oplock.c b/source3/smbd/oplock.c index 0945ac66776..4c84fd41ede 100644 --- a/source3/smbd/oplock.c +++ b/source3/smbd/oplock.c @@ -329,7 +329,7 @@ static void add_oplock_timeout_handler(files_struct *fsp) the client for LEVEL2. *******************************************************************/ -static void process_oplock_async_level2_break_message(struct messaging_context *msg_ctx, +void process_oplock_async_level2_break_message(struct messaging_context *msg_ctx, void *private_data, uint32_t msg_type, struct server_id src, @@ -699,7 +699,8 @@ static void process_open_retry_message(struct messaging_context *msg_ctx, none. ****************************************************************************/ -void release_level_2_oplocks_on_change(files_struct *fsp) +static void contend_level2_oplocks_begin_default(files_struct *fsp, + enum level2_contention_type type) { int i; struct share_mode_lock *lck; @@ -799,6 +800,26 @@ void release_level_2_oplocks_on_change(files_struct *fsp) TALLOC_FREE(lck); } +void contend_level2_oplocks_begin(files_struct *fsp, + enum level2_contention_type type) +{ + if (koplocks && koplocks->ops->contend_level2_oplocks_begin) { + koplocks->ops->contend_level2_oplocks_begin(fsp, type); + return; + } + + contend_level2_oplocks_begin_default(fsp, type); +} + +void contend_level2_oplocks_end(files_struct *fsp, + enum level2_contention_type type) +{ + /* Only kernel oplocks implement this so far */ + if (koplocks && koplocks->ops->contend_level2_oplocks_end) { + koplocks->ops->contend_level2_oplocks_end(fsp, type); + } +} + /**************************************************************************** Linearize a share mode entry struct to an internal oplock break message. ****************************************************************************/ diff --git a/source3/smbd/oplock_irix.c b/source3/smbd/oplock_irix.c index 23b2fa90813..bbc9132a08f 100644 --- a/source3/smbd/oplock_irix.c +++ b/source3/smbd/oplock_irix.c @@ -270,8 +270,10 @@ static void irix_oplocks_read_fde_handler(struct event_context *ev, ****************************************************************************/ static const struct kernel_oplocks_ops irix_koplocks = { - .set_oplock = irix_set_kernel_oplock, - .release_oplock = irix_release_kernel_oplock, + .set_oplock = irix_set_kernel_oplock, + .release_oplock = irix_release_kernel_oplock, + .contend_level2_oplocks_begin = NULL, + .contend_level2_oplocks_end = NULL, }; struct kernel_oplocks *irix_init_kernel_oplocks(TALLOC_CTX *mem_ctx) diff --git a/source3/smbd/oplock_linux.c b/source3/smbd/oplock_linux.c index 5840ff0851c..273fbfdc01e 100644 --- a/source3/smbd/oplock_linux.c +++ b/source3/smbd/oplock_linux.c @@ -179,8 +179,10 @@ static bool linux_oplocks_available(void) ****************************************************************************/ static const struct kernel_oplocks_ops linux_koplocks = { - .set_oplock = linux_set_kernel_oplock, - .release_oplock = linux_release_kernel_oplock, + .set_oplock = linux_set_kernel_oplock, + .release_oplock = linux_release_kernel_oplock, + .contend_level2_oplocks_begin = NULL, + .contend_level2_oplocks_end = NULL, }; struct kernel_oplocks *linux_init_kernel_oplocks(TALLOC_CTX *mem_ctx) diff --git a/source3/smbd/reply.c b/source3/smbd/reply.c index 60e2e5dc7a5..151f9d0827d 100644 --- a/source3/smbd/reply.c +++ b/source3/smbd/reply.c @@ -3010,8 +3010,6 @@ void reply_lockread(struct smb_request *req) return; } - release_level_2_oplocks_on_change(fsp); - numtoread = SVAL(req->vwv+1, 0); startpos = IVAL_TO_SMB_OFF_T(req->vwv+2, 0); @@ -4497,8 +4495,6 @@ void reply_lock(struct smb_request *req) return; } - release_level_2_oplocks_on_change(fsp); - count = (uint64_t)IVAL(req->vwv+1, 0); offset = (uint64_t)IVAL(req->vwv+3, 0); @@ -6870,13 +6866,6 @@ void reply_lockingX(struct smb_request *req) } } - /* - * We do this check *after* we have checked this is not a oplock break - * response message. JRA. - */ - - release_level_2_oplocks_on_change(fsp); - if (req->buflen < (num_ulocks + num_locks) * (large_file_format ? 20 : 10)) { reply_nterror(req, NT_STATUS_INVALID_PARAMETER); diff --git a/source3/smbd/vfs.c b/source3/smbd/vfs.c index 515e557f67e..8d82ca550c5 100644 --- a/source3/smbd/vfs.c +++ b/source3/smbd/vfs.c @@ -514,8 +514,6 @@ int vfs_allocate_file_space(files_struct *fsp, uint64_t len) uint64_t space_avail; uint64_t bsize,dfree,dsize; - release_level_2_oplocks_on_change(fsp); - /* * Actually try and commit the space on disk.... */ @@ -541,15 +539,23 @@ int vfs_allocate_file_space(files_struct *fsp, uint64_t len) DEBUG(10,("vfs_allocate_file_space: file %s, shrink. Current size %.0f\n", fsp->fsp_name, (double)st.st_size )); + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_ALLOC_SHRINK); + flush_write_cache(fsp, SIZECHANGE_FLUSH); if ((ret = SMB_VFS_FTRUNCATE(fsp, (SMB_OFF_T)len)) != -1) { set_filelen_write_cache(fsp, len); } + + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_ALLOC_SHRINK); + return ret; } /* Grow - we need to test if we have enough space. */ + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_ALLOC_GROW); + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_ALLOC_GROW); + if (!lp_strict_allocate(SNUM(fsp->conn))) return 0; @@ -581,7 +587,8 @@ int vfs_set_filelen(files_struct *fsp, SMB_OFF_T len) { int ret; - release_level_2_oplocks_on_change(fsp); + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_SET_FILE_LEN); + DEBUG(10,("vfs_set_filelen: ftruncate %s to len %.0f\n", fsp->fsp_name, (double)len)); flush_write_cache(fsp, SIZECHANGE_FLUSH); if ((ret = SMB_VFS_FTRUNCATE(fsp, len)) != -1) { @@ -592,6 +599,8 @@ int vfs_set_filelen(files_struct *fsp, SMB_OFF_T len) fsp->fsp_name); } + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_SET_FILE_LEN); + return ret; } @@ -613,7 +622,6 @@ int vfs_fill_sparse(files_struct *fsp, SMB_OFF_T len) size_t num_to_write; ssize_t pwrite_ret; - release_level_2_oplocks_on_change(fsp); ret = SMB_VFS_FSTAT(fsp, &st); if (ret == -1) { return ret; @@ -626,13 +634,16 @@ int vfs_fill_sparse(files_struct *fsp, SMB_OFF_T len) DEBUG(10,("vfs_fill_sparse: write zeros in file %s from len %.0f to len %.0f (%.0f bytes)\n", fsp->fsp_name, (double)st.st_size, (double)len, (double)(len - st.st_size))); + contend_level2_oplocks_begin(fsp, LEVEL2_CONTEND_FILL_SPARSE); + flush_write_cache(fsp, SIZECHANGE_FLUSH); if (!sparse_buf) { sparse_buf = SMB_CALLOC_ARRAY(char, SPARSE_BUF_WRITE_SIZE); if (!sparse_buf) { errno = ENOMEM; - return -1; + ret = -1; + goto out; } } @@ -647,17 +658,23 @@ int vfs_fill_sparse(files_struct *fsp, SMB_OFF_T len) if (pwrite_ret == -1) { DEBUG(10,("vfs_fill_sparse: SMB_VFS_PWRITE for file %s failed with error %s\n", fsp->fsp_name, strerror(errno) )); - return -1; + ret = -1; + goto out; } if (pwrite_ret == 0) { - return 0; + ret = 0; + goto out; } total += pwrite_ret; } set_filelen_write_cache(fsp, len); - return 0; + + ret = 0; + out: + contend_level2_oplocks_end(fsp, LEVEL2_CONTEND_FILL_SPARSE); + return ret; } /****************************************************************************