2 * Unix SMB/CIFS implementation.
3 * Support for OneFS system interfaces.
5 * Copyright (C) Tim Prouty, 2008
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, see <http://www.gnu.org/licenses/>.
23 #include "onefs_config.h"
24 #include "oplock_onefs.h"
26 #include <ifs/ifs_syscalls.h>
27 #include <isi_acl/isi_acl_util.h>
28 #include <sys/isi_acl.h>
31 * Initialize the sm_lock struct before passing it to ifs_createfile.
/*
 * Fills in *sml from the open parameters. isexe, access_mask and
 * share_access are stored as given; statonly is derived from access_mask
 * by is_stat_open(); doc, private_options and the timeout are always
 * reset (false, 0 and timeval_set(0, 0) respectively).
 *
 * conn and create_options are not used by any statement visible in this
 * block.
 */
33 static void smlock_init(connection_struct *conn, struct sm_lock *sml,
34 bool isexe, uint32_t access_mask, uint32_t share_access,
35 uint32_t create_options)
/* doc is unconditionally false for every lock built here. */
37 sml->sm_type.doc = false;
38 sml->sm_type.isexe = isexe;
/* statonly is computed from the access mask rather than passed in. */
39 sml->sm_type.statonly = is_stat_open(access_mask);
40 sml->sm_type.access_mask = access_mask;
41 sml->sm_type.share_access = share_access;
44 * private_options was previously used for DENY_DOS/DENY_FCB checks in
45 * the kernel, but are now properly handled by fcb_or_dos_open. In
46 * these cases, ifs_createfile will return a sharing violation, which
47 * gives fcb_or_dos_open the chance to open a duplicate file handle.
49 sml->sm_type.private_options = 0;
51 /* 1 second delay is handled in onefs_open.c by deferring the open */
52 sml->sm_timeout = timeval_set(0, 0);
/*
 * Logs the contents of *sml through DEBUG() at the given debuglevel.
 * Two messages exist: a fixed "sml == NULL" line and a full dump of
 * every sm_type field plus the timeout. The condition that selects
 * between them is on a line not shown in this view (presumably a NULL
 * check on sml - confirm).
 */
55 static void smlock_dump(int debuglevel, const struct sm_lock *sml)
58 DEBUG(debuglevel, ("sml == NULL\n"));
/*
 * The three boolean fields are printed as "True"/"False"; both timeout
 * members are cast to int to match the %d conversions.
 */
63 ("smlock: doc=%s, isexec=%s, statonly=%s, access_mask=0x%x, "
64 "share_access=0x%x, private_options=0x%x timeout=%d/%d\n",
65 sml->sm_type.doc ? "True" : "False",
66 sml->sm_type.isexe ? "True" : "False",
67 sml->sm_type.statonly ? "True" : "False",
68 sml->sm_type.access_mask,
69 sml->sm_type.share_access,
70 sml->sm_type.private_options,
71 (int)sml->sm_timeout.tv_sec,
72 (int)sml->sm_timeout.tv_usec));
76 * External interface to ifs_createfile
/*
 * Wrapper around ifs_createfile(). Visible steps, in order:
 *   - copy path and translate it to its UNIX form (mapped_name);
 *   - convert the Samba security descriptor sd with
 *     onefs_samba_sd_to_sd(), yielding sec_info_effective;
 *   - convert oplock_request and dos_flags to their OneFS equivalents;
 *   - decide whether to set CF_FLAGS_DEFAULT_ACL;
 *   - optionally strip FILE_EXECUTE from open_access_mask;
 *   - build the sm_lock (skipped for INTERNAL_OPEN_ONLY requests);
 *   - call ifs_createfile() and convert the granted oplock back;
 *   - release the converted SD and the translated name.
 *
 * Only part of the parameter list is shown in this view; base_fd, path,
 * access_mask, flags, mode, oplock_request, id, dos_flags and
 * granted_oplock are used below and are presumably declared on the
 * signature lines that are not shown - confirm against the header.
 */
78 int onefs_sys_create_file(connection_struct *conn,
82 uint32_t open_access_mask,
83 uint32_t share_access,
84 uint32_t create_options,
89 struct security_descriptor *sd,
93 struct sm_lock sml, *psml = NULL;
94 enum oplock_type onefs_oplock;
95 enum oplock_type onefs_granted_oplock = OPLOCK_NONE;
96 struct ifs_security_descriptor ifs_sd = {}, *pifs_sd = NULL;
97 uint32_t sec_info_effective = 0;
99 uint32_t onefs_dos_attributes;
100 struct ifs_createfile_flags cf_flags = CF_FLAGS_NONE;
101 char *mapped_name = NULL;
104 START_PROFILE(syscall_createfile);
106 /* Translate the name to UNIX before calling ifs_createfile */
/* The copy lives on talloc_tos() and is freed explicitly at the end. */
107 mapped_name = talloc_strdup(talloc_tos(), path);
108 if (mapped_name == NULL) {
112 result = SMB_VFS_TRANSLATE_NAME(conn, &mapped_name,
113 vfs_translate_to_unix);
114 if (!NT_STATUS_IS_OK(result)) {
118 /* Setup security descriptor and get secinfo. */
121 uint32_t sec_info_sent = 0;
/* Only bits inside IFS_SEC_INFO_KNOWN_MASK are forwarded. */
123 sec_info_sent = (get_sec_info(sd) & IFS_SEC_INFO_KNOWN_MASK);
125 status = onefs_samba_sd_to_sd(sec_info_sent, sd, &ifs_sd,
126 SNUM(conn), &sec_info_effective);
128 if (!NT_STATUS_IS_OK(status)) {
129 DEBUG(1, ("SD initialization failure: %s\n",
138 /* Stripping off private bits will be done for us. */
139 onefs_oplock = onefs_samba_oplock_to_oplock(oplock_request);
/* With oplocks disabled on the share, no oplock may have been requested. */
141 if (!lp_oplocks(SNUM(conn))) {
142 SMB_ASSERT(onefs_oplock == OPLOCK_NONE);
145 /* Convert samba dos flags to UF_DOS_* attributes. */
146 onefs_dos_attributes = dos_attributes_to_stat_dos_flags(dos_flags);
149 * Deal with kernel creating Default ACLs. (Isilon bug 47447.)
151 * 1) "nt acl support = no", default_acl = no
152 * 2) "inherit permissions = yes", default_acl = no
/*
 * CF_FLAGS_DEFAULT_ACL is added only when NT ACL support is enabled and
 * "inherit permissions" is disabled for this share.
 */
154 if (lp_nt_acl_support(SNUM(conn)) && !lp_inherit_perms(SNUM(conn)))
155 cf_flags = cf_flags_or(cf_flags, CF_FLAGS_DEFAULT_ACL);
158 * Some customer workflows require the execute bit to be ignored.
160 if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
161 PARM_ALLOW_EXECUTE_ALWAYS,
162 PARM_ALLOW_EXECUTE_ALWAYS_DEFAULT) &&
163 (open_access_mask & FILE_EXECUTE)) {
165 DEBUG(3, ("Stripping execute bit from %s: (0x%x)\n", mapped_name,
169 open_access_mask &= ~FILE_EXECUTE;
172 * Add READ_DATA, so we're not left with desired_access=0. An
173 * execute call should imply the client will read the data.
175 open_access_mask |= FILE_READ_DATA;
177 DEBUGADD(3, ("New stripped access mask: 0x%x\n",
/*
 * Trace of the final arguments. Both the "fname" and the "path" fields
 * of this message are filled from mapped_name.
 */
181 DEBUG(10,("onefs_sys_create_file: base_fd = %d, fname = %s "
182 "open_access_mask = 0x%x, flags = 0x%x, mode = 0%o, "
183 "desired_oplock = %s, id = 0x%x, secinfo = 0x%x, sd = %p, "
184 "dos_attributes = 0x%x, path = %s, "
185 "default_acl=%s\n", base_fd, mapped_name,
186 (unsigned int)open_access_mask,
189 onefs_oplock_str(onefs_oplock),
191 sec_info_effective, sd,
192 (unsigned int)onefs_dos_attributes, mapped_name,
193 cf_flags_and_bool(cf_flags, CF_FLAGS_DEFAULT_ACL) ?
196 /* Initialize smlock struct for files/dirs but not internal opens */
/*
 * The sm_lock is built from access_mask, whereas ifs_createfile() below
 * receives open_access_mask, which may have had FILE_EXECUTE stripped.
 */
197 if (!(oplock_request & INTERNAL_OPEN_ONLY)) {
198 smlock_init(conn, &sml, is_executable(mapped_name), access_mask,
199 share_access, create_options);
203 smlock_dump(10, psml);
/*
 * The O_ACCMODE bits are masked out of flags here; the requested access
 * is passed separately as open_access_mask cast to enum ifs_ace_rights.
 */
205 ret_fd = ifs_createfile(base_fd, mapped_name,
206 (enum ifs_ace_rights)open_access_mask, flags & ~O_ACCMODE, mode,
207 onefs_oplock, id, psml, sec_info_effective, pifs_sd,
208 onefs_dos_attributes, cf_flags, &onefs_granted_oplock);
/* On failure (ret_fd < 0) the message carries strerror(errno). */
210 DEBUG(10,("onefs_sys_create_file(%s): ret_fd = %d, "
211 "onefs_granted_oplock = %s\n",
212 ret_fd < 0 ? strerror(errno) : "success", ret_fd,
213 onefs_oplock_str(onefs_granted_oplock)));
/* The granted oplock is converted back only if the caller asked for it. */
215 if (granted_oplock) {
217 onefs_oplock_to_samba_oplock(onefs_granted_oplock);
/* Cleanup: stop profiling, release the converted SD and the name copy. */
221 END_PROFILE(syscall_createfile);
222 aclu_free_sd(pifs_sd, false);
223 TALLOC_FREE(mapped_name);
229 * FreeBSD based sendfile implementation that allows for atomic semantics.
/*
 * Sends an optional header followed by file data with sendfile(),
 * retrying on EINTR/EAGAIN (and EWOULDBLOCK where defined).
 *
 * header, when used, supplies the single iovec sent ahead of the file
 * data; otherwise the iovec base is NULL. The last visible statement
 * returns count + hdr_len. With atomic set, one early path returns 0
 * instead of -1 and a partial send is logged at level 0.
 *
 * Declarations and several guards of this function are on lines not
 * shown in this view.
 */
231 static ssize_t onefs_sys_do_sendfile(int tofd, int fromfd,
232 const DATA_BLOB *header, SMB_OFF_T offset, size_t count, bool atomic)
244 hdr.headers = &hdtrl;
249 /* Set up the header iovec. */
/* hdr_len remembers the header size for the final return value. */
251 hdtrl.iov_base = (void *)header->data;
252 hdtrl.iov_len = hdr_len = header->length;
254 hdtrl.iov_base = NULL;
/*
 * Runs while total + hdtrl.iov_len is non-zero. How total is initialised
 * and decremented is not visible here (presumably from count - confirm).
 */
259 while (total + hdtrl.iov_len) {
264 * FreeBSD sendfile returns 0 on success, -1 on error.
265 * Remember, the tofd and fromfd are reversed..... :-).
266 * nwritten includes the header data sent.
270 ret = sendfile(fromfd, tofd, offset, total, &hdr,
272 #if defined(EWOULDBLOCK)
273 } while (ret == -1 && (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK));
275 } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
278 /* On error we're done. */
284 * If this was an ATOMIC sendfile, nwritten doesn't
285 * necessarily indicate an error. It could mean count > than
286 * what sendfile can handle atomically (usually 64K) or that
287 * there was a short read due to the file being truncated.
/* Atomic callers get 0 from this path, non-atomic callers get -1. */
290 return atomic ? 0 : -1;
294 * An atomic sendfile should never send partial data!
296 if (atomic && nwritten != total + hdtrl.iov_len) {
/*
 * NOTE(review): the format uses %llu and %d, but the declarations of
 * nwritten and total are not visible here, and hdtrl.iov_len is a
 * size_t in a standard struct iovec - confirm the conversions match.
 */
297 DEBUG(0,("Atomic sendfile() sent partial data: "
298 "%llu of %d\n", nwritten,
299 total + hdtrl.iov_len));
304 * If this was a short (signal interrupted) write we may need
305 * to subtract it from the header data, or null out the header
306 * data altogether if we wrote more than hdtrl.iov_len bytes.
307 * We change nwritten to be the number of file bytes written.
310 if (hdtrl.iov_base && hdtrl.iov_len) {
/* Header fully sent: drop it and keep only the file-byte count. */
311 if (nwritten >= hdtrl.iov_len) {
312 nwritten -= hdtrl.iov_len;
313 hdtrl.iov_base = NULL;
/* Header partly sent: advance the iovec past the bytes already sent. */
317 (void *)((caddr_t)hdtrl.iov_base + nwritten);
318 hdtrl.iov_len -= nwritten;
/* The value returned here is the requested file bytes plus the header. */
325 return count + hdr_len;
329 * Handles the subtleties of using sendfile with CIFS.
/*
 * Public sendfile entry point. It consults the PARM_ATOMIC_SENDFILE
 * share option, calls onefs_sys_do_sendfile() once, and then handles
 * the atomic-mode outcomes described in the long comment below:
 *   - a short read (case 1);
 *   - E2BIG for a large read (case 2), which is retried non-atomically
 *     only when PARM_SENDFILE_LARGE_READS is enabled;
 *   - any other error, which is logged at level 1.
 *
 * The rest of the parameter list, the local declarations and all return
 * statements are on lines not shown in this view.
 */
331 ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd,
332 const DATA_BLOB *header, SMB_OFF_T offset,
338 START_PROFILE_BYTES(syscall_sendfile, count);
/* Per-share option lookup; the body of this branch is not shown here. */
340 if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
341 PARM_ATOMIC_SENDFILE,
342 PARM_ATOMIC_SENDFILE_DEFAULT)) {
346 /* Try the sendfile */
347 ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset, count,
350 /* If the sendfile wasn't atomic, we're done. */
/*
 * NOTE(review): "%ul" is the %u conversion followed by a literal 'l',
 * not a long conversion. The helper that produces ret returns ssize_t,
 * so the specifier probably does not match - confirm and correct.
 */
352 DEBUG(10, ("non-atomic sendfile read %ul bytes\n", ret));
353 END_PROFILE(syscall_sendfile);
358 * Atomic sendfile takes care to not write anything to the socket
359 * until all of the requested bytes have been read from the file.
360 * There are two atomic cases that need to be handled.
362 * 1. The file was truncated causing less data to be read than was
363 * requested. In this case, we return back to the caller to
364 * indicate 0 bytes were written to the socket. This should
365 * prompt the caller to fallback to the standard read path: read
366 * the data, create a header that indicates how many bytes were
367 * actually read, and send the header/data back to the client.
369 * This saves us from standard sendfile behavior of sending a
370 * header promising more data then will actually be sent. The
371 * only two options are to close the socket and kill the client
372 * connection, or write a bunch of 0s. Closing the client
373 * connection is bad because there could actually be multiple
374 * sessions multiplexed from the same client that are all dropped
375 * because of a truncate. Writing the remaining data as 0s also
376 * isn't good, because the client will have an incorrect version
377 * of the file. If the file is written back to the server, the 0s
378 * will be written back. Fortunately, atomic sendfile allows us
379 * to avoid making this choice in most cases.
381 * 2. One downside of atomic sendfile, is that there is a limit on
382 * the number of bytes that can be sent atomically. The kernel
383 * has a limited amount of mbuf space that it can read file data
384 * into without exhausting the system's mbufs, so a buffer of
385 * length xfsize is used. The xfsize at the time of writing this
386 * is 64K. xfsize bytes are read from the file, and subsequently
387 * written to the socket. This makes it impossible to do the
388 * sendfile atomically for a byte count > xfsize.
390 * To cope with large requests, atomic sendfile returns -1 with
391 * errno set to E2BIG. Since windows maxes out at 64K writes,
392 * this is currently only a concern with non-windows clients.
393 * Posix extensions allow the full 24bit bytecount field to be
394 * used in ReadAndX, and clients such as smbclient and the linux
395 * cifs client can request up to 16MB reads! There are a few
396 * options for handling large sendfile requests.
398 * a. Fall back to the standard read path. This is unacceptable
399 * because it would require prohibitively large mallocs.
401 * b. Fall back to using samba's fake_send_file which emulates
402 * the kernel sendfile in userspace. This still has the same
403 * problem of sending the header before all of the data has
404 * been read, so it doesn't buy us anything, and has worse
405 * performance than the kernel's zero-copy sendfile.
407 * c. Use non-atomic sendfile syscall to attempt a zero copy
408 * read, and hope that there isn't a short read due to
409 * truncation. In the case of a short read, there are two
412 * 1. Kill the client connection
414 * 2. Write zeros to the socket for the remaining bytes
415 * promised in the header.
417 * It is safer from a data corruption perspective to kill the
418 * client connection, so this is our default behavior, but if
419 * this causes problems this can be configured to write zeros
423 /* Handle case 1: short read -> truncated file. */
425 END_PROFILE(syscall_sendfile);
429 /* Handle case 2: large read. */
430 if (ret == -1 && errno == E2BIG) {
/* The non-atomic retry is opt-in through PARM_SENDFILE_LARGE_READS. */
432 if (!lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
433 PARM_SENDFILE_LARGE_READS,
434 PARM_SENDFILE_LARGE_READS_DEFAULT)) {
435 DEBUG(3, ("Not attempting non-atomic large sendfile: "
436 "%lu bytes\n", count));
437 END_PROFILE(syscall_sendfile);
/* E2BIG with a count below 0x10000 (64K) is logged at level 0. */
441 if (count < 0x10000) {
442 DEBUG(0, ("Count < 2^16 and E2BIG was returned! %lu\n",
446 DEBUG(10, ("attempting non-atomic large sendfile: %lu bytes\n",
449 /* Try a non-atomic sendfile. */
450 ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset,
452 /* Real error: kill the client connection. */
454 DEBUG(1, ("error on non-atomic large sendfile "
455 "(%lu bytes): %s\n", count,
457 END_PROFILE(syscall_sendfile);
461 /* Short read: kill the client connection. */
/*
 * NOTE(review): header is dereferenced unconditionally here, while
 * onefs_sys_do_sendfile() has a path that sets the header iovec base to
 * NULL (apparently for a missing header). Confirm header cannot be NULL
 * on this path.
 */
462 if (ret != count + header->length) {
463 DEBUG(1, ("short read on non-atomic large sendfile "
464 "(%lu of %lu bytes): %s\n", ret, count,
468 * Returning ret here would cause us to drop into the
469 * codepath that calls sendfile_short_send, which
470 * sends the client a bunch of zeros instead.
471 * Returning -1 kills the connection.
473 if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
475 PARM_SENDFILE_SAFE_DEFAULT)) {
476 END_PROFILE(syscall_sendfile);
480 END_PROFILE(syscall_sendfile);
484 DEBUG(10, ("non-atomic large sendfile successful\n"));
487 /* There was error in the atomic sendfile. */
489 DEBUG(1, ("error on %s sendfile (%lu bytes): %s\n",
490 atomic ? "atomic" : "non-atomic",
491 count, strerror(errno)));
494 END_PROFILE(syscall_sendfile);
499 * Only talloc the spill buffer once (reallocing when necessary).
/*
 * Returns a buffer of at least new_count chars that is kept in function
 * static storage between calls. The buffer is created with
 * talloc_array() on first use and resized with talloc_realloc() when a
 * larger size is requested; both calls use a NULL talloc context.
 *
 * NOTE(review): cur_count is an int while new_count is a size_t, so the
 * comparison and the assignments below mix signed and unsigned types.
 */
501 static char *get_spill_buffer(size_t new_count)
503 static int cur_count = 0;
504 static char *spill_buffer = NULL;
506 /* If a sufficiently sized buffer exists, just return. */
/*
 * NOTE(review): a first call with new_count == 0 satisfies this test
 * while spill_buffer is still NULL, which would trip the assertion;
 * confirm that callers never pass 0.
 */
507 if (new_count <= cur_count) {
508 SMB_ASSERT(spill_buffer);
512 /* Allocate the first time. */
513 if (cur_count == 0) {
514 SMB_ASSERT(!spill_buffer);
515 spill_buffer = talloc_array(NULL, char, new_count);
517 cur_count = new_count;
522 /* A buffer exists, but it's not big enough, so realloc. */
523 SMB_ASSERT(spill_buffer);
/*
 * NOTE(review): the result overwrites the only pointer to the old
 * buffer. If talloc_realloc() fails and returns NULL, the old block can
 * no longer be reached, and a later call that finds cur_count large
 * enough would assert on the NULL pointer. The failure handling is on
 * lines not shown in this view - confirm.
 */
524 spill_buffer = talloc_realloc(NULL, spill_buffer, char, new_count);
526 cur_count = new_count;
532 * recvfile does zero-copy writes given an fd to write to, and a socket with
533 * some data to write. If recvfile read more than it was able to write, it
534 * spills the data into a buffer. After first reading any additional data
535 * from the socket into the buffer, the spill buffer is then written with pwrite.
/*
 * Receives count bytes from the socket fromfd into the file tofd at
 * offset. Visible phases:
 *   1. obtain a spill buffer of count bytes from get_spill_buffer();
 *   2. call recvfile() in a loop, accumulating bytes read and written;
 *   3. read whatever is still expected from the socket into the spill
 *      buffer with sys_read();
 *   4. write the outstanding bytes to the file with sys_pwrite();
 *   5. on the way out, drain any bytes still expected on the socket.
 *
 * The rest of the parameter list, several declarations and all
 * return/goto statements are on lines not shown in this view.
 */
538 ssize_t onefs_sys_recvfile(int fromfd, int tofd, SMB_OFF_T offset,
541 char *spill_buffer = NULL;
542 bool socket_drained = false;
544 off_t total_rbytes = 0;
545 off_t total_wbytes = 0;
549 START_PROFILE_BYTES(syscall_recvfile, count);
551 DEBUG(10,("onefs_recvfile: from = %d, to = %d, offset=%llu, count = "
552 "%lu\n", fromfd, tofd, offset, count));
555 END_PROFILE(syscall_recvfile);
560 * Setup up a buffer for recvfile to spill data that has been read
561 * from the socket but not written.
563 spill_buffer = get_spill_buffer(count);
564 if (spill_buffer == NULL) {
570 * Keep trying recvfile until:
571 * - There is no data left to read on the socket, or
572 * - bytes read != bytes written, or
573 * - An error is returned that isn't EINTR/EAGAIN
576 /* Keep track of bytes read/written for recvfile */
580 DEBUG(10, ("calling recvfile loop, offset + total_wbytes = "
581 "%llu, count - total_rbytes = %llu\n",
582 offset + total_wbytes, count - total_rbytes));
/*
 * NOTE(review): the trace above logs count - total_rbytes, but the
 * length passed to recvfile() here is count - total_wbytes. Confirm
 * which of the two is intended.
 */
584 ret = recvfile(tofd, fromfd, offset + total_wbytes,
585 count - total_wbytes, &rbytes, &wbytes, 0,
588 DEBUG(10, ("recvfile ret = %d, errno = %d, rbytes = %llu, "
589 "wbytes = %llu\n", ret, ret >= 0 ? 0 : errno,
592 /* Update our progress so far */
593 total_rbytes += rbytes;
594 total_wbytes += wbytes;
/*
 * Another pass happens only when bytes remain, the last call wrote all
 * it read, and it failed with EINTR or EAGAIN.
 */
596 } while ((count - total_rbytes) && (rbytes == wbytes) &&
597 (ret == -1 && (errno == EINTR || errno == EAGAIN)));
599 DEBUG(10, ("total_rbytes = %llu, total_wbytes = %llu\n",
600 total_rbytes, total_wbytes));
602 /* Log if recvfile didn't write everything it read. */
603 if (total_rbytes != total_wbytes) {
604 DEBUG(3, ("partial recvfile: total_rbytes=%llu but "
605 "total_wbytes=%llu, diff = %llu\n", total_rbytes,
606 total_wbytes, total_rbytes - total_wbytes));
607 SMB_ASSERT(total_rbytes > total_wbytes);
611 * If there is still data on the socket, read it off.
613 while (total_rbytes < count) {
615 DEBUG(3, ("shallow recvfile (%s), reading %llu\n",
616 strerror(errno), count - total_rbytes));
619 * Read the remaining data into the spill buffer. recvfile
620 * may already have some data in the spill buffer, so start
621 * filling the buffer at total_rbytes - total_wbytes.
623 ret = sys_read(fromfd,
624 spill_buffer + (total_rbytes - total_wbytes),
625 count - total_rbytes);
629 DEBUG(0, ("shallow recvfile read: EOF\n"));
631 DEBUG(0, ("shallow recvfile read failed: %s\n",
634 /* Socket is dead, so treat as if it were drained. */
635 socket_drained = true;
639 /* Data was read so update the rbytes */
/* Reaching this point with bytes still unread is treated as fatal. */
643 if (total_rbytes != count) {
644 smb_panic("Unread recvfile data still on the socket!");
648 * Now write any spilled data + the extra data read off the socket.
650 while (total_wbytes < count) {
652 DEBUG(3, ("partial recvfile, writing %llu\n", count - total_wbytes));
/*
 * NOTE(review): every iteration writes from the start of spill_buffer
 * while the file offset advances with total_wbytes. If sys_pwrite()
 * completes only part of a request, the next pass would resend the
 * beginning of the buffer - confirm whether the source pointer should
 * advance as well.
 */
654 ret = sys_pwrite(tofd, spill_buffer, count - total_wbytes,
655 offset + total_wbytes);
658 DEBUG(0, ("partial recvfile write failed: %s\n",
663 /* Data was written so update the wbytes */
672 END_PROFILE(syscall_recvfile);
674 /* Make sure we always try to drain the socket. */
675 if (!socket_drained && count - total_rbytes) {
/* errno is captured before draining; its restore is not shown here. */
676 int saved_errno = errno;
678 if (drain_socket(fromfd, count - total_rbytes) !=
679 count - total_rbytes) {
680 /* Socket is dead! */
681 DEBUG(0, ("drain socket failed: %d\n", errno));
/*
 * Copies the fields of a native struct stat (src) into Samba's
 * struct stat_ex (dst), one assignment per field. The four timestamps
 * come from the st_*timespec members, including the birth time;
 * st_flags is carried over, and the st_snapid member is stored in
 * dst->vfs_private.
 */
689 void init_stat_ex_from_onefs_stat(struct stat_ex *dst, const struct stat *src)
693 dst->st_ex_dev = src->st_dev;
694 dst->st_ex_ino = src->st_ino;
695 dst->st_ex_mode = src->st_mode;
696 dst->st_ex_nlink = src->st_nlink;
697 dst->st_ex_uid = src->st_uid;
698 dst->st_ex_gid = src->st_gid;
699 dst->st_ex_rdev = src->st_rdev;
700 dst->st_ex_size = src->st_size;
/* Timestamps are copied as whole timespec values. */
701 dst->st_ex_atime = src->st_atimespec;
702 dst->st_ex_mtime = src->st_mtimespec;
703 dst->st_ex_ctime = src->st_ctimespec;
704 dst->st_ex_btime = src->st_birthtimespec;
705 dst->st_ex_blksize = src->st_blksize;
706 dst->st_ex_blocks = src->st_blocks;
708 dst->st_ex_flags = src->st_flags;
/* The st_snapid value travels in the VFS-private slot. */
710 dst->vfs_private = src->st_snapid;
/*
 * stat() wrapper: stats fname into a local struct stat, forces st_size
 * to 0 for directories, and converts the result into *sbuf with
 * init_stat_ex_from_onefs_stat(). Any check of ret before the
 * conversion, and the return statement, are on lines not shown here.
 */
713 int onefs_sys_stat(const char *fname, SMB_STRUCT_STAT *sbuf)
716 struct stat onefs_sbuf;
718 ret = stat(fname, &onefs_sbuf);
721 /* we always want directories to appear zero size */
722 if (S_ISDIR(onefs_sbuf.st_mode)) {
723 onefs_sbuf.st_size = 0;
725 init_stat_ex_from_onefs_stat(sbuf, &onefs_sbuf);
/*
 * fstat() wrapper: stats the open descriptor fd into a local
 * struct stat, forces st_size to 0 for directories, and converts the
 * result into *sbuf with init_stat_ex_from_onefs_stat(). Any check of
 * ret before the conversion, and the return statement, are on lines not
 * shown here.
 */
730 int onefs_sys_fstat(int fd, SMB_STRUCT_STAT *sbuf)
733 struct stat onefs_sbuf;
735 ret = fstat(fd, &onefs_sbuf);
738 /* we always want directories to appear zero size */
739 if (S_ISDIR(onefs_sbuf.st_mode)) {
740 onefs_sbuf.st_size = 0;
742 init_stat_ex_from_onefs_stat(sbuf, &onefs_sbuf);
/*
 * enc_fstatat() wrapper: stats fname relative to base_fd, passing
 * ENC_DEFAULT and the caller's flags, into a local struct stat. It then
 * forces st_size to 0 for directories and converts the result into
 * *sbuf with init_stat_ex_from_onefs_stat(). The declaration of the
 * flags parameter, any check of ret, and the return statement are on
 * lines not shown here.
 */
747 int onefs_sys_fstat_at(int base_fd, const char *fname, SMB_STRUCT_STAT *sbuf,
751 struct stat onefs_sbuf;
753 ret = enc_fstatat(base_fd, fname, ENC_DEFAULT, &onefs_sbuf, flags);
756 /* we always want directories to appear zero size */
757 if (S_ISDIR(onefs_sbuf.st_mode)) {
758 onefs_sbuf.st_size = 0;
760 init_stat_ex_from_onefs_stat(sbuf, &onefs_sbuf);
/*
 * lstat() wrapper: stats fname into a local struct stat, forces st_size
 * to 0 for directories, and converts the result into *sbuf with
 * init_stat_ex_from_onefs_stat(). Any check of ret before the
 * conversion, and the return statement, are on lines not shown here.
 */
765 int onefs_sys_lstat(const char *fname, SMB_STRUCT_STAT *sbuf)
768 struct stat onefs_sbuf;
770 ret = lstat(fname, &onefs_sbuf);
773 /* we always want directories to appear zero size */
774 if (S_ISDIR(onefs_sbuf.st_mode)) {
775 onefs_sbuf.st_size = 0;
777 init_stat_ex_from_onefs_stat(sbuf, &onefs_sbuf);