s3/smbd: fix deferred open with streams and kernel oplocks
authorRalph Boehme <slow@samba.org>
Tue, 7 Mar 2017 15:27:39 +0000 (16:27 +0100)
committerJeremy Allison <jra@samba.org>
Fri, 10 Mar 2017 02:28:26 +0000 (03:28 +0100)
I noticed smbd can get stuck in an open() call with kernel oplocks
enabled and named streams (provided by vfs_streams_xattr):

- client opens a file and with an exclusive oplock

- client starts writing to the file

- client opens an existing stream of the file

- the smbd process gets stuck in an open()

What happens is:

we had setup a locking.tdb record watch in defer_open(), the watch was
triggered, we reattempted the open and got stuck in a blocking open
because the oplock holder (ourselves) hadn't given up the oplock yet.

Cf e576bf5310bc9de9686a71539e9a1b60b4fba5cc for the commit that added
the kernel oplock retry logic. tldr: with kernel oplocks the first open
is non-blocking, but the second one is blocking.

Detailed analysis follows.

When opening a named stream of a file, Samba internally opens the
underlying "base" file first. This internal open of the basefile suceeds
and does *not* trigger an oplock break (because it is an internal open
that doesn't call open() at all) but it is added as an entry to the
locking.tdb record of the file.

Next, the stream open ends up in streams_xattr where a non-blocking
open() on the base file is called. This open fails with EWOULDBLOCK
because we have another fd with a kernel oplock on the file.

So we call defer_open() which sets up a watch on the locking.tdb record.

In the subsequent error unwinding code in open_file_ntcreate() and
callers we close the internal open file handle of the basefile which
also removes the entry from the locking.tdb record and so *changes the
record*.

This fires the record watch and in the callback defer_open_done() we
don't check whether the condition (oplock gone) we're interested in is
actually met. The callback blindly reschedules the open request with
schedule_deferred_open_message_smb().

schedule_deferred_open_message_smb() schedules an immediate tevent event
which has precedence over the IPC fd events in messaging, so the open is
always (!) reattempted before processing the oplock break message.

As explained above, this second open will be a blocking one so we get
stuck in a blocking open.

It doesn't help to make all opens non-blocking, that would just result
in a busy loop failing the open, as we never process the oplock break
message (remember, schedule_deferred_open_message_smb() used immediate
tevent events).

To fix this we must add some logic to the record watch callback to check
whether the record watch was done for a kernel oplock file and if yes,
check if the oplock state changed. If not, simply reschedule the
deferred open and keep waiting.

This logic is only needed for kernel oplocks, not for Samba-level
oplocks, because there's no risk of deadlocking, the worst that can
happen is a rescheduled open that fails again in the oplock checks and
gets deferred again.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=7537

Signed-off-by: Ralph Boehme <slow@samba.org>
Reviewed-by: Jeremy Allison <jra@samba.org>
source3/smbd/open.c

index 3e590eedec2b909b862f3ce18286ae0059432abe..e220bcbe5924ce1d30fbc7b1fd01983b8d48d583 100644 (file)
@@ -1551,6 +1551,23 @@ static bool delay_for_oplock(files_struct *fsp,
        return delay;
 }
 
+/**
+ * Return lease or oplock state from a share mode
+ **/
+static uint32_t get_lease_type_from_share_mode(const struct share_mode_data *d)
+{
+       uint32_t e_lease_type = 0;
+       uint32_t i;
+
+       for (i=0; i < d->num_share_modes; i++) {
+               struct share_mode_entry *e = &d->share_modes[i];
+
+               e_lease_type |= get_lease_type(d, e);
+       }
+
+       return e_lease_type;
+}
+
 static bool file_has_brlocks(files_struct *fsp)
 {
        struct byte_range_lock *br_lck;
@@ -1963,6 +1980,11 @@ static struct deferred_open_record *deferred_open_record_create(
 struct defer_open_state {
        struct smbXsrv_connection *xconn;
        uint64_t mid;
+       struct file_id file_id;
+       struct timeval request_time;
+       struct timeval timeout;
+       bool kernel_oplock;
+       uint32_t lease_type;
 };
 
 static void defer_open_done(struct tevent_req *req);
@@ -1981,6 +2003,7 @@ static void defer_open(struct share_mode_lock *lck,
                       struct timeval timeout,
                       struct smb_request *req,
                       bool delayed_for_oplocks,
+                      bool kernel_oplock,
                       struct file_id id)
 {
        struct deferred_open_record *open_rec = NULL;
@@ -1992,11 +2015,12 @@ static void defer_open(struct share_mode_lock *lck,
        abs_timeout = timeval_sum(&request_time, &timeout);
 
        DBG_DEBUG("request time [%s] timeout [%s] mid [%" PRIu64 "] "
-                 "delayed_for_oplocks [%s] file_id [%s]\n",
+                 "delayed_for_oplocks [%s] kernel_oplock [%s] file_id [%s]\n",
                  timeval_string(talloc_tos(), &request_time, false),
                  timeval_string(talloc_tos(), &abs_timeout, false),
                  req->mid,
                  delayed_for_oplocks ? "yes" : "no",
+                 kernel_oplock ? "yes" : "no",
                  file_id_string_tos(&id));
 
        open_rec = deferred_open_record_create(delayed_for_oplocks,
@@ -2013,6 +2037,11 @@ static void defer_open(struct share_mode_lock *lck,
        }
        watch_state->xconn = req->xconn;
        watch_state->mid = req->mid;
+       watch_state->file_id = lck->data->id;
+       watch_state->request_time = request_time;
+       watch_state->timeout = timeout;
+       watch_state->kernel_oplock = kernel_oplock;
+       watch_state->lease_type = get_lease_type_from_share_mode(lck->data);
 
        DBG_DEBUG("defering mid %" PRIu64 "\n", req->mid);
 
@@ -2042,8 +2071,12 @@ static void defer_open_done(struct tevent_req *req)
 {
        struct defer_open_state *state = tevent_req_callback_data(
                req, struct defer_open_state);
+       struct tevent_req *watch_req = NULL;
+       struct share_mode_lock *lck = NULL;
+       bool schedule_req = true;
+       struct timeval timeout;
        NTSTATUS status;
-       bool ret;
+       bool ok;
 
        status = dbwrap_watched_watch_recv(req, talloc_tos(), NULL, NULL,
                                          NULL);
@@ -2055,13 +2088,72 @@ static void defer_open_done(struct tevent_req *req)
                 * Even if it failed, retry anyway. TODO: We need a way to
                 * tell a re-scheduled open about that error.
                 */
+               if (NT_STATUS_EQUAL(status, NT_STATUS_IO_TIMEOUT) &&
+                   state->kernel_oplock)
+               {
+                       /*
+                        * If we reschedule but the kernel oplock is still hold
+                        * we would block in the second open as that will be a
+                        * blocking open attempt.
+                        */
+                       exit_server("Kernel oplock holder didn't "
+                                   "respond to break message");
+               }
        }
 
-       DEBUG(10, ("scheduling mid %llu\n", (unsigned long long)state->mid));
+       if (state->kernel_oplock) {
+               lck = get_existing_share_mode_lock(talloc_tos(), state->file_id);
+               if (lck != NULL) {
+                       uint32_t lease_type;
+
+                       lease_type = get_lease_type_from_share_mode(lck->data);
 
-       ret = schedule_deferred_open_message_smb(state->xconn, state->mid);
-       SMB_ASSERT(ret);
-       TALLOC_FREE(state);
+                       if ((lease_type != 0) &&
+                           (lease_type == state->lease_type))
+                       {
+                               DBG_DEBUG("Unchanged lease: %" PRIu32 "\n",
+                                         lease_type);
+                               schedule_req = false;
+                       }
+               }
+       }
+
+       if (schedule_req) {
+               DBG_DEBUG("scheduling mid %" PRIu64 "\n", state->mid);
+
+               ok = schedule_deferred_open_message_smb(state->xconn,
+                                                       state->mid);
+               if (!ok) {
+                       exit_server("schedule_deferred_open_message_smb failed");
+               }
+               TALLOC_FREE(lck);
+               TALLOC_FREE(state);
+               return;
+       }
+
+       DBG_DEBUG("Keep waiting for oplock release for [%s/%s%s] "
+                 "mid: %" PRIu64 "\n",
+                 lck->data->servicepath,
+                 lck->data->base_name,
+                 lck->data->stream_name ? lck->data->stream_name : "",
+                 state->mid);
+
+       watch_req = dbwrap_watched_watch_send(state,
+                                             state->xconn->ev_ctx,
+                                             lck->data->record,
+                                             (struct server_id){0});
+       if (watch_req == NULL) {
+               exit_server("Could not watch share mode record");
+       }
+       tevent_req_set_callback(watch_req, defer_open_done, state);
+
+       timeout = timeval_sum(&state->request_time, &state->timeout);
+       ok = tevent_req_set_endtime(watch_req, state->xconn->ev_ctx, timeout);
+       if (!ok) {
+               exit_server("tevent_req_set_endtime failed");
+       }
+
+       TALLOC_FREE(lck);
 }
 
 /**
@@ -2212,7 +2304,8 @@ static NTSTATUS fcb_or_dos_open(struct smb_request *req,
 static void schedule_defer_open(struct share_mode_lock *lck,
                                struct file_id id,
                                struct timeval request_time,
-                               struct smb_request *req)
+                               struct smb_request *req,
+                               bool kernel_oplock)
 {
        /* This is a relative time, added to the absolute
           request_time value to get the absolute timeout time.
@@ -2236,7 +2329,7 @@ static void schedule_defer_open(struct share_mode_lock *lck,
                return;
        }
 
-       defer_open(lck, request_time, timeout, req, true, id);
+       defer_open(lck, request_time, timeout, req, true, kernel_oplock, id);
 }
 
 /****************************************************************************
@@ -2922,7 +3015,7 @@ static NTSTATUS open_file_ntcreate(connection_struct *conn,
                                         first_open_attempt);
                if (delay) {
                        schedule_defer_open(lck, fsp->file_id, request_time,
-                                           req);
+                                           req, true);
                        TALLOC_FREE(lck);
                        DEBUG(10, ("Sent oplock break request to kernel "
                                   "oplock holder\n"));
@@ -3055,7 +3148,7 @@ static NTSTATUS open_file_ntcreate(connection_struct *conn,
                                         first_open_attempt);
                if (delay) {
                        schedule_defer_open(lck, fsp->file_id,
-                                           request_time, req);
+                                           request_time, req, false);
                        TALLOC_FREE(lck);
                        fd_close(fsp);
                        return NT_STATUS_SHARING_VIOLATION;
@@ -3159,7 +3252,7 @@ static NTSTATUS open_file_ntcreate(connection_struct *conn,
 
                        if (!request_timed_out(request_time, timeout)) {
                                defer_open(lck, request_time, timeout, req,
-                                          false, id);
+                                          false, false, id);
                        }
                }