 * RAW transport layer over SOCK_STREAM sockets.
 *
 * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */
#include <errno.h>

#include <sys/socket.h>
#include <sys/types.h>

#include <netinet/tcp.h>

#include <common/buffer.h>
#include <common/compat.h>
#include <common/config.h>
#include <common/debug.h>
#include <common/standard.h>
#include <common/ticks.h>
#include <common/time.h>

#include <proto/connection.h>
#include <proto/freq_ctr.h>
#include <proto/log.h>
#include <proto/pipe.h>
#include <proto/raw_sock.h>
#include <proto/stream_interface.h>
#include <proto/task.h>

#include <types/global.h>

#if defined(CONFIG_HAP_LINUX_SPLICE)
#include <common/splice.h>

/* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes
 * because of timestamps. Use this as a hint for not looping on splice().
 */
#define SPLICE_FULL_HINT	16*1448

/* how many data we attempt to splice at once when the buffer is configured for
 * infinite forwarding */
#define MAX_SPLICE_AT_ONCE	(1<<30)

/* Versions of splice between 2.6.25 and 2.6.27.12 were bogus and would return EAGAIN
 * on incoming shutdowns. On these versions, we have to call recv() after such a return
 * in order to find whether splice is OK or not. Since 2.6.27.13 we don't need to do
 * this anymore, and we can avoid this logic by defining ASSUME_SPLICE_WORKS.
 */

/* Receive up to <count> bytes from connection <conn>'s socket and splice them
 * into pipe <pipe>. Returns :
 *   -1 if splice() is not supported
 *   >= 0 to report the amount of spliced bytes.
 * connection flags are updated (error, read0, wait_room, wait_data).
 * The caller must have previously allocated the pipe.
 *
 * NOTE(review): this copy of the file had its structural lines (braces, local
 * declarations, loop header, labels and returns) destroyed; the control flow
 * below was reconstructed to match upstream HAProxy's raw_sock.c — confirm
 * against the reference revision before relying on it.
 */
int raw_sock_to_pipe(struct connection *conn, struct pipe *pipe, unsigned int count)
{
#ifndef ASSUME_SPLICE_WORKS
	/* set once we observe that this kernel's splice() reports a closed
	 * connection by returning 0 (kernels >= 2.6.27.13).
	 */
	static int splice_detects_close;
#endif
	int ret;
	int retval = 0;

	/* nothing can be done if the control layer is not ready yet */
	if (!conn_ctrl_ready(conn))
		return 0;

	if (!fd_recv_ready(conn->t.sock.fd))
		return 0;

	errno = 0;

	/* Under Linux, if FD_POLL_HUP is set, we have reached the end.
	 * Since older splice() implementations were buggy and returned
	 * EAGAIN on end of read, let's bypass the call to splice() now.
	 */
	if (unlikely(!(fdtab[conn->t.sock.fd].ev & FD_POLL_IN))) {
		/* stop here if we reached the end of data */
		if ((fdtab[conn->t.sock.fd].ev & (FD_POLL_ERR|FD_POLL_HUP)) == FD_POLL_HUP)
			goto out_read0;

		/* report error on POLL_ERR before connection establishment */
		if ((fdtab[conn->t.sock.fd].ev & FD_POLL_ERR) && (conn->flags & CO_FL_WAIT_L4_CONN)) {
			conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH;
			errno = 0; /* let the caller do a getsockopt() if it wants it */
			return retval;
		}
	}

	while (count) {
		if (count > MAX_SPLICE_AT_ONCE)
			count = MAX_SPLICE_AT_ONCE;

		ret = splice(conn->t.sock.fd, NULL, pipe->prod, NULL, count,
		             SPLICE_F_MOVE|SPLICE_F_NONBLOCK);

		if (ret <= 0) {
			if (ret == 0) {
				/* connection closed. This is only detected by
				 * recent kernels (>= 2.6.27.13). If we notice
				 * it works, we store the info for later use.
				 */
#ifndef ASSUME_SPLICE_WORKS
				splice_detects_close = 1;
#endif
				goto out_read0;
			}

			if (errno == EAGAIN) {
				/* there are two reasons for EAGAIN :
				 *   - nothing in the socket buffer (standard)
				 *   - the pipe is full
				 *   - the connection is closed (kernel < 2.6.27.13)
				 * The last case is annoying but know if we can detect it
				 * and if we can't then we rely on the call to recv() to
				 * get a valid verdict. The difference between the first
				 * two situations is problematic. Since we don't know if
				 * the pipe is full, we'll stop if the pipe is not empty.
				 * Anyway, we will almost always fill/empty the pipe.
				 */
				if (pipe->data) {
					/* alway stop reading until the pipe is flushed */
					conn->flags |= CO_FL_WAIT_ROOM;
					break;
				}

				/* We don't know if the connection was closed,
				 * but if we know splice detects close, then we
				 * know it did not happen here.
				 * But if we're called upon POLLIN with an empty
				 * pipe and get EAGAIN, it is suspect enough to
				 * try to fall back to the normal recv scheme
				 * which will be able to deal with the situation.
				 */
#ifndef ASSUME_SPLICE_WORKS
				if (splice_detects_close)
#endif
					fd_cant_recv(conn->t.sock.fd); /* we know for sure that it's EAGAIN */
				break;
			}
			else if (errno == ENOSYS || errno == EINVAL || errno == EBADF) {
				/* splice not supported on this end, disable it.
				 * We can safely return -1 since there is no
				 * chance that any data has been piped yet.
				 */
				return -1;
			}
			else if (errno == EINTR) {
				/* interrupted by a signal : simply retry */
				continue;
			}
			/* here we have another error */
			conn->flags |= CO_FL_ERROR;
			break;
		} /* ret <= 0 */

		retval += ret;
		pipe->data += ret;
		count -= ret;

		if (pipe->data >= SPLICE_FULL_HINT || ret >= global.tune.recv_enough) {
			/* We've read enough of it for this time, let's stop before
			 * being asked to poll.
			 */
			conn->flags |= CO_FL_WAIT_ROOM;
			fd_done_recv(conn->t.sock.fd);
			break;
		}
	} /* while */

	/* first data received marks the end of the L4 connection setup */
	if (unlikely(conn->flags & CO_FL_WAIT_L4_CONN) && retval)
		conn->flags &= ~CO_FL_WAIT_L4_CONN;
	return retval;

 out_read0:
	/* end of stream : report the shutdown to the upper layers */
	conn_sock_read0(conn);
	conn->flags &= ~CO_FL_WAIT_L4_CONN;
	return retval;
}
/* Send as many bytes as possible from the pipe to the connection's socket.
196
int raw_sock_from_pipe(struct connection *conn, struct pipe *pipe)
200
if (!conn_ctrl_ready(conn))
203
if (!fd_send_ready(conn->t.sock.fd))
208
ret = splice(pipe->cons, NULL, conn->t.sock.fd, NULL, pipe->data,
209
SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
212
if (ret == 0 || errno == EAGAIN) {
213
fd_cant_send(conn->t.sock.fd);
216
else if (errno == EINTR)
219
/* here we have another error */
220
conn->flags |= CO_FL_ERROR;
227
if (unlikely(conn->flags & CO_FL_WAIT_L4_CONN) && done)
228
conn->flags &= ~CO_FL_WAIT_L4_CONN;
232
#endif /* CONFIG_HAP_LINUX_SPLICE */
235
/* Receive up to <count> bytes from connection <conn>'s socket and store them
236
* into buffer <buf>. Only one call to recv() is performed, unless the
237
* buffer wraps, in which case a second call may be performed. The connection's
238
* flags are updated with whatever special event is detected (error, read0,
239
* empty). The caller is responsible for taking care of those events and
240
* avoiding the call if inappropriate. The function does not call the
241
* connection's polling update function, so the caller is responsible for this.
242
* errno is cleared before starting so that the caller knows that if it spots an
243
* error without errno, it's pending and can be retrieved via getsockopt(SO_ERROR).
245
static int raw_sock_to_buf(struct connection *conn, struct buffer *buf, int count)
250
if (!conn_ctrl_ready(conn))
253
if (!fd_recv_ready(conn->t.sock.fd))
258
if (unlikely(!(fdtab[conn->t.sock.fd].ev & FD_POLL_IN))) {
259
/* stop here if we reached the end of data */
260
if ((fdtab[conn->t.sock.fd].ev & (FD_POLL_ERR|FD_POLL_HUP)) == FD_POLL_HUP)
263
/* report error on POLL_ERR before connection establishment */
264
if ((fdtab[conn->t.sock.fd].ev & FD_POLL_ERR) && (conn->flags & CO_FL_WAIT_L4_CONN)) {
265
conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH;
270
/* let's realign the buffer to optimize I/O */
271
if (buffer_empty(buf))
274
/* read the largest possible block. For this, we perform only one call
275
* to recv() unless the buffer wraps and we exactly fill the first hunk,
276
* in which case we accept to do it once again. A new attempt is made on
280
/* first check if we have some room after p+i */
281
try = buf->data + buf->size - (buf->p + buf->i);
282
/* otherwise continue between data and p-o */
284
try = buf->p - (buf->data + buf->o);
291
ret = recv(conn->t.sock.fd, bi_end(buf), try, 0);
297
/* unfortunately, on level-triggered events, POLL_HUP
298
* is generally delivered AFTER the system buffer is
299
* empty, so this one might never match.
301
if (fdtab[conn->t.sock.fd].ev & FD_POLL_HUP)
304
fd_done_recv(conn->t.sock.fd);
312
else if (errno == EAGAIN || errno == ENOTCONN) {
313
fd_cant_recv(conn->t.sock.fd);
316
else if (errno != EINTR) {
317
conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH;
322
if (unlikely(conn->flags & CO_FL_WAIT_L4_CONN) && done)
323
conn->flags &= ~CO_FL_WAIT_L4_CONN;
327
conn_sock_read0(conn);
328
conn->flags &= ~CO_FL_WAIT_L4_CONN;
330
/* Now a final check for a possible asynchronous low-level error
331
* report. This can happen when a connection receives a reset
332
* after a shutdown, both POLL_HUP and POLL_ERR are queued, and
333
* we might have come from there by just checking POLL_HUP instead
334
* of recv()'s return value 0, so we have no way to tell there was
335
* an error without checking.
337
if (unlikely(fdtab[conn->t.sock.fd].ev & FD_POLL_ERR))
338
conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH;
343
/* Send all pending bytes from buffer <buf> to connection <conn>'s socket.
344
* <flags> may contain some CO_SFL_* flags to hint the system about other
345
* pending data for example.
346
* Only one call to send() is performed, unless the buffer wraps, in which case
347
* a second call may be performed. The connection's flags are updated with
348
* whatever special event is detected (error, empty). The caller is responsible
349
* for taking care of those events and avoiding the call if inappropriate. The
350
* function does not call the connection's polling update function, so the caller
351
* is responsible for this.
353
static int raw_sock_from_buf(struct connection *conn, struct buffer *buf, int flags)
355
int ret, try, done, send_flag;
357
if (!conn_ctrl_ready(conn))
360
if (!fd_send_ready(conn->t.sock.fd))
364
/* send the largest possible block. For this we perform only one call
365
* to send() unless the buffer wraps and we exactly fill the first hunk,
366
* in which case we accept to do it once again.
370
/* outgoing data may wrap at the end */
371
if (buf->data + try > buf->p)
372
try = buf->data + try - buf->p;
374
send_flag = MSG_DONTWAIT | MSG_NOSIGNAL;
375
if (try < buf->o || flags & CO_SFL_MSG_MORE)
376
send_flag |= MSG_MORE;
378
ret = send(conn->t.sock.fd, bo_ptr(buf), try, send_flag);
384
if (likely(buffer_empty(buf)))
385
/* optimize data alignment in the buffer */
388
/* if the system buffer is full, don't insist */
392
else if (ret == 0 || errno == EAGAIN || errno == ENOTCONN) {
393
/* nothing written, we need to poll for write first */
394
fd_cant_send(conn->t.sock.fd);
397
else if (errno != EINTR) {
398
conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH;
402
if (unlikely(conn->flags & CO_FL_WAIT_L4_CONN) && done)
403
conn->flags &= ~CO_FL_WAIT_L4_CONN;
408
/* transport-layer operations for RAW sockets */
409
struct xprt_ops raw_sock = {
410
.snd_buf = raw_sock_from_buf,
411
.rcv_buf = raw_sock_to_buf,
412
#if defined(CONFIG_HAP_LINUX_SPLICE)
413
.rcv_pipe = raw_sock_to_pipe,
414
.snd_pipe = raw_sock_from_pipe,