2
* Unix SMB/CIFS implementation.
3
* Support for OneFS system interfaces.
5
* Copyright (C) Tim Prouty, 2008
7
* This program is free software; you can redistribute it and/or modify
8
* it under the terms of the GNU General Public License as published by
9
* the Free Software Foundation; either version 3 of the License, or
10
* (at your option) any later version.
12
* This program is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
* GNU General Public License for more details.
17
* You should have received a copy of the GNU General Public License
18
* along with this program; if not, see <http://www.gnu.org/licenses/>.
23
#include "onefs_config.h"
24
#include "oplock_onefs.h"
26
#include <ifs/ifs_syscalls.h>
27
#include <isi_acl/isi_acl_util.h>
28
#include <sys/isi_acl.h>
31
* Initialize the sm_lock struct before passing it to ifs_createfile.
33
/*
 * Populate *sml from the open parameters so that it can be handed to
 * ifs_createfile.
 *
 * sml           lock descriptor, written in place
 * isexe         copied to sm_type.isexe
 * access_mask   copied to sm_type.access_mask and also fed to
 *               is_stat_open() to derive sm_type.statonly
 * share_access  copied to sm_type.share_access
 *
 * sm_type.doc is always false, sm_type.private_options is always 0 and
 * sm_timeout is always timeval_set(0, 0).
 *
 * NOTE(review): conn and create_options are not referenced by any
 * statement visible in this listing.
 * NOTE(review): this listing interleaves the original line numbers with
 * the code and omits some lines (for example the braces), so the code is
 * documented as found rather than rewritten.
 */
static void smlock_init(connection_struct *conn, struct sm_lock *sml,
34
bool isexe, uint32_t access_mask, uint32_t share_access,
35
uint32_t create_options)
37
sml->sm_type.doc = false;
38
sml->sm_type.isexe = isexe;
39
/* statonly is derived from the access mask, not passed by the caller. */
sml->sm_type.statonly = is_stat_open(access_mask);
40
sml->sm_type.access_mask = access_mask;
41
sml->sm_type.share_access = share_access;
44
* private_options was previously used for DENY_DOS/DENY_FCB checks in
45
* the kernel, but are now properly handled by fcb_or_dos_open. In
46
* these cases, ifs_createfile will return a sharing violation, which
47
* gives fcb_or_dos_open the chance to open a duplicate file handle.
49
sml->sm_type.private_options = 0;
51
/* 1 second delay is handled in onefs_open.c by deferring the open */
52
/* A zero timeout is stored; any waiting is left to the deferral above. */
sml->sm_timeout = timeval_set(0, 0);
55
/*
 * Debug helper: writes the fields of *sml through DEBUG at the level given
 * by debuglevel. doc, isexe and statonly are printed as True/False, the
 * access mask, share access and private_options in hex, and sm_timeout as
 * tv_sec/tv_usec cast to int.
 *
 * A separate message is emitted for a missing descriptor. The test that
 * selects it is not part of this listing (presumably a NULL check on sml;
 * confirm against the full file).
 */
static void smlock_dump(int debuglevel, const struct sm_lock *sml)
58
/* Message used when there is no descriptor to print. */
DEBUG(debuglevel, ("sml == NULL\n"));
63
("smlock: doc=%s, isexec=%s, statonly=%s, access_mask=0x%x, "
64
"share_access=0x%x, private_options=0x%x timeout=%d/%d\n",
65
sml->sm_type.doc ? "True" : "False",
66
sml->sm_type.isexe ? "True" : "False",
67
sml->sm_type.statonly ? "True" : "False",
68
sml->sm_type.access_mask,
69
sml->sm_type.share_access,
70
sml->sm_type.private_options,
71
(int)sml->sm_timeout.tv_sec,
72
(int)sml->sm_timeout.tv_usec));
76
* External interface to ifs_createfile
78
/*
 * Wrapper that prepares the arguments for ifs_createfile() and calls it.
 *
 * Steps visible in this listing, in order:
 *  - the Samba security descriptor sd is converted into ifs_sd with
 *    onefs_samba_sd_to_sd(), which also yields sec_info_effective;
 *  - oplock_request is translated with onefs_samba_oplock_to_oplock();
 *  - dos_flags is translated with dos_attributes_to_stat_dos_flags();
 *  - CF_FLAGS_DEFAULT_ACL is added to cf_flags when nt acl support is on
 *    and inherit permissions is off for the share;
 *  - when the allow-execute-always parameter is set and FILE_EXECUTE was
 *    requested, FILE_EXECUTE is cleared from open_access_mask and
 *    FILE_READ_DATA is set instead;
 *  - unless oplock_request carries INTERNAL_OPEN_ONLY, an sm_lock is
 *    initialised with smlock_init();
 *  - ifs_createfile() is called and its result stored in ret_fd;
 *  - if granted_oplock is non-NULL the granted oplock is converted back
 *    with onefs_oplock_to_samba_oplock();
 *  - pifs_sd is released with aclu_free_sd().
 *
 * NOTE(review): several parameters used below (base_fd, path, access_mask,
 * flags, mode, oplock_request, id, dos_flags, granted_oplock) and the
 * locals status and ret_fd are declared on lines that are missing from
 * this listing, as are the return statements. Presumably ret_fd is the
 * return value; confirm against the full file.
 */
int onefs_sys_create_file(connection_struct *conn,
82
uint32_t open_access_mask,
83
uint32_t share_access,
84
uint32_t create_options,
89
struct security_descriptor *sd,
93
/* psml and pifs_sd start out NULL; where they get pointed at sml and */
/* ifs_sd is not visible in this listing. */
struct sm_lock sml, *psml = NULL;
94
enum oplock_type onefs_oplock;
95
enum oplock_type onefs_granted_oplock = OPLOCK_NONE;
96
struct ifs_security_descriptor ifs_sd = {}, *pifs_sd = NULL;
97
uint32_t sec_info_effective = 0;
99
uint32_t onefs_dos_attributes;
100
struct ifs_createfile_flags cf_flags = CF_FLAGS_NONE;
102
START_PROFILE(syscall_createfile);
104
/* Set up security descriptor and get secinfo. */
107
uint32_t sec_info_sent = 0;
109
/* Only the security-info bits in IFS_SEC_INFO_KNOWN_MASK are forwarded. */
sec_info_sent = (get_sec_info(sd) & IFS_SEC_INFO_KNOWN_MASK);
111
status = onefs_samba_sd_to_sd(sec_info_sent, sd, &ifs_sd,
112
SNUM(conn), &sec_info_effective);
114
if (!NT_STATUS_IS_OK(status)) {
115
DEBUG(1, ("SD initialization failure: %s\n",
124
/* Stripping off private bits will be done for us. */
125
onefs_oplock = onefs_samba_oplock_to_oplock(oplock_request);
127
/* With oplocks disabled on the share, no oplock may have been requested. */
if (!lp_oplocks(SNUM(conn))) {
128
SMB_ASSERT(onefs_oplock == OPLOCK_NONE);
131
/* Convert samba dos flags to UF_DOS_* attributes. */
132
onefs_dos_attributes = dos_attributes_to_stat_dos_flags(dos_flags);
135
* Deal with kernel creating Default ACLs. (Isilon bug 47447.)
137
* 1) "nt acl support = no", default_acl = no
138
* 2) "inherit permissions = yes", default_acl = no
140
/* The default-ACL flag is requested only when neither case above applies. */
if (lp_nt_acl_support(SNUM(conn)) && !lp_inherit_perms(SNUM(conn)))
141
cf_flags = cf_flags_or(cf_flags, CF_FLAGS_DEFAULT_ACL);
144
* Some customer workflows require the execute bit to be ignored.
146
if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
147
PARM_ALLOW_EXECUTE_ALWAYS,
148
PARM_ALLOW_EXECUTE_ALWAYS_DEFAULT) &&
149
(open_access_mask & FILE_EXECUTE)) {
151
DEBUG(3, ("Stripping execute bit from %s: (0x%x)\n", path,
155
open_access_mask &= ~FILE_EXECUTE;
158
* Add READ_DATA, so we're not left with desired_access=0. An
159
* execute call should imply the client will read the data.
161
open_access_mask |= FILE_READ_DATA;
163
DEBUGADD(3, ("New stripped access mask: 0x%x\n",
167
/* NOTE(review): the first format fragment below ends in fname = %s with */
/* no trailing separator, so the path and the open_access_mask label run */
/* together in the log line. path is also supplied for both the fname */
/* and the path conversions. Confirm whether that is intended. */
DEBUG(10,("onefs_sys_create_file: base_fd = %d, fname = %s"
168
"open_access_mask = 0x%x, flags = 0x%x, mode = 0%o, "
169
"desired_oplock = %s, id = 0x%x, secinfo = 0x%x, sd = %p, "
170
"dos_attributes = 0x%x, path = %s, "
171
"default_acl=%s\n", base_fd, path,
172
(unsigned int)open_access_mask,
175
onefs_oplock_str(onefs_oplock),
177
sec_info_effective, sd,
178
(unsigned int)onefs_dos_attributes, path,
179
cf_flags_and_bool(cf_flags, CF_FLAGS_DEFAULT_ACL) ?
182
/* Initialize smlock struct for files/dirs but not internal opens */
183
if (!(oplock_request & INTERNAL_OPEN_ONLY)) {
184
/* NOTE(review): the lock is initialised from access_mask, while the */
/* execute-bit stripping above only alters open_access_mask. Confirm */
/* that using the unstripped mask here is intended. */
smlock_init(conn, &sml, is_executable(path), access_mask,
185
share_access, create_options);
189
smlock_dump(10, psml);
191
/* The O_ACCMODE bits are cleared from flags; the requested rights are */
/* passed separately as open_access_mask cast to enum ifs_ace_rights. */
ret_fd = ifs_createfile(base_fd, path,
192
(enum ifs_ace_rights)open_access_mask, flags & ~O_ACCMODE, mode,
193
onefs_oplock, id, psml, sec_info_effective, pifs_sd,
194
onefs_dos_attributes, cf_flags, &onefs_granted_oplock);
196
/* A negative ret_fd is reported with strerror(errno), else as success. */
DEBUG(10,("onefs_sys_create_file(%s): ret_fd = %d, "
197
"onefs_granted_oplock = %s\n",
198
ret_fd < 0 ? strerror(errno) : "success", ret_fd,
199
onefs_oplock_str(onefs_granted_oplock)));
201
/* The granted oplock is reported only when the caller asked for it. */
if (granted_oplock) {
203
onefs_oplock_to_samba_oplock(onefs_granted_oplock);
207
END_PROFILE(syscall_createfile);
208
/* Release the converted descriptor; pifs_sd may still be NULL here. */
aclu_free_sd(pifs_sd, false);
214
* FreeBSD based sendfile implementation that allows for atomic semantics.
216
/*
 * Loop around the FreeBSD sendfile() call, with an optional header iovec.
 *
 * tofd, fromfd  descriptors; passed to sendfile() as (fromfd, tofd)
 * header        blob whose data/length fill the single header iovec
 * offset        file offset handed to sendfile()
 * count         number of file bytes requested
 * atomic        selects the return value on the zero-progress path and
 *               enables the partial-send check
 *
 * Visible behaviour: the header iovec hdtrl is attached to hdr.headers;
 * the loop runs while total + hdtrl.iov_len is non-zero; sendfile() is
 * retried while it fails with EINTR; after a partial write the header
 * iovec is advanced or cleared and nwritten is reduced to file bytes; the
 * final return value is count + hdr_len.
 *
 * NOTE(review): the declarations of hdr, hdtrl, total, nwritten, hdr_len
 * and ret, the conditions guarding several statements, and the code that
 * advances offset and total are on lines missing from this listing.
 */
static ssize_t onefs_sys_do_sendfile(int tofd, int fromfd,
217
const DATA_BLOB *header, SMB_OFF_T offset, size_t count, bool atomic)
229
hdr.headers = &hdtrl;
234
/* Set up the header iovec. */
236
hdtrl.iov_base = header->data;
237
hdtrl.iov_len = hdr_len = header->length;
239
/* No-header variant; the condition selecting it is not visible here. */
hdtrl.iov_base = NULL;
244
/* Keep going until both the file bytes and the header bytes are sent. */
while (total + hdtrl.iov_len) {
249
* FreeBSD sendfile returns 0 on success, -1 on error.
250
* Remember, the tofd and fromfd are reversed..... :-).
251
* nwritten includes the header data sent.
255
ret = sendfile(fromfd, tofd, offset, total, &hdr,
257
/* Retry transparently when the call was interrupted by a signal. */
} while (ret == -1 && errno == EINTR);
259
/* On error we're done. */
265
* If this was an ATOMIC sendfile, nwritten doesn't
266
* necessarily indicate an error. It could mean count > than
267
* what sendfile can handle atomically (usually 64K) or that
268
* there was a short read due to the file being truncated.
271
/* Atomic callers get 0 (nothing sent), non-atomic callers get -1. */
/* The condition guarding this return is not visible in this listing. */
return atomic ? 0 : -1;
275
* An atomic sendfile should never send partial data!
277
if (atomic && nwritten != total + hdtrl.iov_len) {
278
/* NOTE(review): %llu is paired with nwritten and %d with */
/* total + hdtrl.iov_len; the declared types are not visible here, */
/* so confirm that the specifiers match them. */
DEBUG(0,("Atomic sendfile() sent partial data: "
279
"%llu of %d\n", nwritten,
280
total + hdtrl.iov_len));
285
* If this was a short (signal interrupted) write we may need
286
* to subtract it from the header data, or null out the header
287
* data altogether if we wrote more than hdtrl.iov_len bytes.
288
* We change nwritten to be the number of file bytes written.
291
if (hdtrl.iov_base && hdtrl.iov_len) {
292
/* Whole header went out: drop it and keep only the file byte count. */
if (nwritten >= hdtrl.iov_len) {
293
nwritten -= hdtrl.iov_len;
294
hdtrl.iov_base = NULL;
298
/* Header only partly sent: advance past the bytes already written. */
(caddr_t)hdtrl.iov_base + nwritten;
299
hdtrl.iov_len -= nwritten;
306
/* Reported total is the requested file bytes plus the header length. */
return count + hdr_len;
310
* Handles the subtleties of using sendfile with CIFS.
312
/*
 * CIFS-aware front end to onefs_sys_do_sendfile().
 *
 * Flow visible in this listing:
 *  1. The PARM_ATOMIC_SENDFILE share parameter is consulted (presumably
 *     it sets the local atomic flag; the assignment is not visible).
 *  2. onefs_sys_do_sendfile() is tried once.
 *  3. If the attempt was not atomic, the result is logged and the function
 *     finishes.
 *  4. Case 1, a short read caused by truncation, finishes immediately.
 *  5. Case 2, ret == -1 with errno == E2BIG: unless
 *     PARM_SENDFILE_LARGE_READS is enabled the function gives up;
 *     otherwise a non-atomic sendfile is attempted, and an error or a
 *     short read on that attempt is logged and handled as described in
 *     the comments below.
 *  6. Any other failure is logged at level 1.
 *
 * NOTE(review): the rest of the parameter list (including count), the
 * declarations of ret and atomic, and every return statement are on lines
 * missing from this listing, so the return values are known only from
 * the comments that survive.
 */
ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd,
313
const DATA_BLOB *header, SMB_OFF_T offset,
319
START_PROFILE_BYTES(syscall_sendfile, count);
321
if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
322
PARM_ATOMIC_SENDFILE,
323
PARM_ATOMIC_SENDFILE_DEFAULT)) {
327
/* Try the sendfile */
328
ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset, count,
331
/* If the sendfile wasn't atomic, we're done. */
333
/* NOTE(review): the format below uses %ul, which printf parses as %u */
/* followed by a literal l; ret comes from a function returning */
/* ssize_t. Confirm the intended specifier. */
DEBUG(10, ("non-atomic sendfile read %ul bytes\n", ret));
334
END_PROFILE(syscall_sendfile);
339
* Atomic sendfile takes care to not write anything to the socket
340
* until all of the requested bytes have been read from the file.
341
* There are two atomic cases that need to be handled.
343
* 1. The file was truncated causing less data to be read than was
344
* requested. In this case, we return back to the caller to
345
* indicate 0 bytes were written to the socket. This should
346
* prompt the caller to fallback to the standard read path: read
347
* the data, create a header that indicates how many bytes were
348
* actually read, and send the header/data back to the client.
350
* This saves us from standard sendfile behavior of sending a
351
* header promising more data then will actually be sent. The
352
* only two options are to close the socket and kill the client
353
* connection, or write a bunch of 0s. Closing the client
354
* connection is bad because there could actually be multiple
355
* sessions multiplexed from the same client that are all dropped
356
* because of a truncate. Writing the remaining data as 0s also
357
* isn't good, because the client will have an incorrect version
358
* of the file. If the file is written back to the server, the 0s
359
* will be written back. Fortunately, atomic sendfile allows us
360
* to avoid making this choice in most cases.
362
* 2. One downside of atomic sendfile, is that there is a limit on
363
* the number of bytes that can be sent atomically. The kernel
364
* has a limited amount of mbuf space that it can read file data
365
* into without exhausting the system's mbufs, so a buffer of
366
* length xfsize is used. The xfsize at the time of writing this
367
* is 64K. xfsize bytes are read from the file, and subsequently
368
* written to the socket. This makes it impossible to do the
369
* sendfile atomically for a byte count > xfsize.
371
* To cope with large requests, atomic sendfile returns -1 with
372
* errno set to E2BIG. Since windows maxes out at 64K writes,
373
* this is currently only a concern with non-windows clients.
374
* Posix extensions allow the full 24bit bytecount field to be
375
* used in ReadAndX, and clients such as smbclient and the linux
376
* cifs client can request up to 16MB reads! There are a few
377
* options for handling large sendfile requests.
379
* a. Fall back to the standard read path. This is unacceptable
380
* because it would require prohibitively large mallocs.
382
* b. Fall back to using samba's fake_send_file which emulates
383
* the kernel sendfile in userspace. This still has the same
384
* problem of sending the header before all of the data has
385
* been read, so it doesn't buy us anything, and has worse
386
* performance than the kernel's zero-copy sendfile.
388
* c. Use non-atomic sendfile syscall to attempt a zero copy
389
* read, and hope that there isn't a short read due to
390
* truncation. In the case of a short read, there are two
393
* 1. Kill the client connection
395
* 2. Write zeros to the socket for the remaining bytes
396
* promised in the header.
398
* It is safer from a data corruption perspective to kill the
399
* client connection, so this is our default behavior, but if
400
* this causes problems this can be configured to write zeros
404
/* Handle case 1: short read -> truncated file. */
406
END_PROFILE(syscall_sendfile);
410
/* Handle case 2: large read. */
411
if (ret == -1 && errno == E2BIG) {
413
/* Large non-atomic reads are attempted only when the share allows it. */
if (!lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
414
PARM_SENDFILE_LARGE_READS,
415
PARM_SENDFILE_LARGE_READS_DEFAULT)) {
416
DEBUG(3, ("Not attempting non-atomic large sendfile: "
417
"%lu bytes\n", count));
418
END_PROFILE(syscall_sendfile);
422
/* E2BIG is not expected below 64K (0x10000), so log it loudly. */
if (count < 0x10000) {
423
DEBUG(0, ("Count < 2^16 and E2BIG was returned! %lu\n",
427
DEBUG(10, ("attempting non-atomic large sendfile: %lu bytes\n",
430
/* Try a non-atomic sendfile. */
431
ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset,
433
/* Real error: kill the client connection. */
435
DEBUG(1, ("error on non-atomic large sendfile "
436
"(%lu bytes): %s\n", count,
438
END_PROFILE(syscall_sendfile);
442
/* Short read: kill the client connection. */
443
/* NOTE(review): header is dereferenced unconditionally here, while */
/* onefs_sys_do_sendfile has a branch that sets a NULL header iovec. */
/* Confirm that header can never be NULL on this path. */
if (ret != count + header->length) {
444
/* NOTE(review): ret is printed with %lu below; it holds the result */
/* of a function returning ssize_t. Confirm the specifier. */
DEBUG(1, ("short read on non-atomic large sendfile "
445
"(%lu of %lu bytes): %s\n", ret, count,
449
* Returning ret here would cause us to drop into the
450
* codepath that calls sendfile_short_send, which
451
* sends the client a bunch of zeros instead.
452
* Returning -1 kills the connection.
454
/* The parameter name on the line between these two is missing from */
/* this listing; only its default, PARM_SENDFILE_SAFE_DEFAULT, shows. */
if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
456
PARM_SENDFILE_SAFE_DEFAULT)) {
457
END_PROFILE(syscall_sendfile);
461
END_PROFILE(syscall_sendfile);
465
DEBUG(10, ("non-atomic large sendfile successful\n"));
468
/* There was an error in the atomic sendfile. */
470
DEBUG(1, ("error on %s sendfile (%lu bytes): %s\n",
471
atomic ? "atomic" : "non-atomic",
472
count, strerror(errno)));
475
END_PROFILE(syscall_sendfile);
480
* Only talloc the spill buffer once (reallocing when necessary).
482
/*
 * Hand out a process-wide spill buffer of at least new_count bytes.
 *
 * The buffer pointer and its size are kept in function-local statics, so
 * the same buffer is reused across calls and only grows:
 *  - if new_count fits in the current size, the existing buffer is used;
 *  - on first use (cur_count == 0) it is created with talloc_array();
 *  - otherwise it is enlarged with talloc_realloc().
 * Both allocations use a NULL talloc context.
 *
 * NOTE(review): cur_count is an int while new_count is a size_t, so the
 * comparison below mixes signed and unsigned operands and the assignments
 * narrow new_count. Consider making cur_count a size_t.
 * NOTE(review): the talloc_realloc() result is stored straight into
 * spill_buffer. The return statements and any allocation-failure checks
 * are on lines missing from this listing; confirm that cur_count is only
 * updated on success and that a failed realloc cannot leave the statics
 * with a NULL buffer and a non-zero size, which would trip the
 * SMB_ASSERT(spill_buffer) checks on a later call.
 */
static char *get_spill_buffer(size_t new_count)
484
static int cur_count = 0;
485
static char *spill_buffer = NULL;
487
/* If a sufficiently sized buffer exists, just return. */
488
if (new_count <= cur_count) {
489
SMB_ASSERT(spill_buffer);
493
/* Allocate the first time. */
494
if (cur_count == 0) {
495
SMB_ASSERT(!spill_buffer);
496
spill_buffer = talloc_array(NULL, char, new_count);
498
cur_count = new_count;
503
/* A buffer exists, but it's not big enough, so realloc. */
504
SMB_ASSERT(spill_buffer);
505
spill_buffer = talloc_realloc(NULL, spill_buffer, char, new_count);
507
cur_count = new_count;
513
* recvfile does zero-copy writes given an fd to write to, and a socket with
514
* some data to write. If recvfile read more than it was able to write, it
515
* spills the data into a buffer. After first reading any additional data
516
* from the socket into the buffer, the spill buffer is then written with a
519
ssize_t onefs_sys_recvfile(int fromfd, int tofd, SMB_OFF_T offset,
522
char *spill_buffer = NULL;
523
bool socket_drained = false;
525
off_t total_rbytes = 0;
526
off_t total_wbytes = 0;
530
START_PROFILE_BYTES(syscall_recvfile, count);
532
DEBUG(10,("onefs_recvfile: from = %d, to = %d, offset=%llu, count = "
533
"%lu\n", fromfd, tofd, offset, count));
536
END_PROFILE(syscall_recvfile);
541
* Set up a buffer for recvfile to spill data that has been read
542
* from the socket but not written.
544
spill_buffer = get_spill_buffer(count);
545
if (spill_buffer == NULL) {
551
* Keep trying recvfile until:
552
* - There is no data left to read on the socket, or
553
* - bytes read != bytes written, or
554
* - An error is returned that isn't EINTR/EAGAIN
557
/* Keep track of bytes read/written for recvfile */
561
DEBUG(10, ("calling recvfile loop, offset + total_wbytes = "
562
"%llu, count - total_rbytes = %llu\n",
563
offset + total_wbytes, count - total_rbytes));
565
ret = recvfile(tofd, fromfd, offset + total_wbytes,
566
count - total_wbytes, &rbytes, &wbytes, 0,
569
DEBUG(10, ("recvfile ret = %d, errno = %d, rbytes = %llu, "
570
"wbytes = %llu\n", ret, ret >= 0 ? 0 : errno,
573
/* Update our progress so far */
574
total_rbytes += rbytes;
575
total_wbytes += wbytes;
577
} while ((count - total_rbytes) && (rbytes == wbytes) &&
578
(ret == -1 && (errno == EINTR || errno == EAGAIN)));
580
DEBUG(10, ("total_rbytes = %llu, total_wbytes = %llu\n",
581
total_rbytes, total_wbytes));
583
/* Log if recvfile didn't write everything it read. */
584
if (total_rbytes != total_wbytes) {
585
DEBUG(3, ("partial recvfile: total_rbytes=%llu but "
586
"total_wbytes=%llu, diff = %llu\n", total_rbytes,
587
total_wbytes, total_rbytes - total_wbytes));
588
SMB_ASSERT(total_rbytes > total_wbytes);
592
* If there is still data on the socket, read it off.
594
while (total_rbytes < count) {
596
DEBUG(3, ("shallow recvfile (%s), reading %llu\n",
597
strerror(errno), count - total_rbytes));
600
* Read the remaining data into the spill buffer. recvfile
601
* may already have some data in the spill buffer, so start
602
* filling the buffer at total_rbytes - total_wbytes.
604
ret = sys_read(fromfd,
605
spill_buffer + (total_rbytes - total_wbytes),
606
count - total_rbytes);
610
DEBUG(0, ("shallow recvfile read: EOF\n"));
612
DEBUG(0, ("shallow recvfile read failed: %s\n",
615
/* Socket is dead, so treat as if it were drained. */
616
socket_drained = true;
620
/* Data was read so update the rbytes */
624
if (total_rbytes != count) {
625
smb_panic("Unread recvfile data still on the socket!");
629
* Now write any spilled data + the extra data read off the socket.
631
while (total_wbytes < count) {
633
DEBUG(3, ("partial recvfile, writing %llu\n", count - total_wbytes));
635
ret = sys_pwrite(tofd, spill_buffer, count - total_wbytes,
636
offset + total_wbytes);
639
DEBUG(0, ("partial recvfile write failed: %s\n",
644
/* Data was written so update the wbytes */
653
END_PROFILE(syscall_recvfile);
655
/* Make sure we always try to drain the socket. */
656
if (!socket_drained && count - total_rbytes) {
657
int saved_errno = errno;
659
if (drain_socket(fromfd, count - total_rbytes) !=
660
count - total_rbytes) {
661
/* Socket is dead! */
662
DEBUG(0, ("drain socket failed: %d\n", errno));