142
142
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143
__be32 saddr, __be32 daddr, struct ip_options *opt)
143
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
145
145
struct inet_sock *inet = inet_sk(sk);
146
146
struct rtable *rt = skb_rtable(skb);
147
147
struct iphdr *iph;
149
149
/* Build the IP header. */
150
skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
150
skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151
151
skb_reset_network_header(skb);
152
152
iph = ip_hdr(skb);
153
153
iph->version = 4;
159
159
iph->frag_off = 0;
160
160
iph->ttl = ip_select_ttl(inet, &rt->dst);
161
iph->daddr = rt->rt_dst;
162
iph->saddr = rt->rt_src;
161
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
163
163
iph->protocol = sk->sk_protocol;
164
164
ip_select_ident(iph, &rt->dst, sk);
166
if (opt && opt->optlen) {
167
iph->ihl += opt->optlen>>2;
168
ip_options_build(skb, opt, daddr, rt, 0);
166
if (opt && opt->opt.optlen) {
167
iph->ihl += opt->opt.optlen>>2;
168
ip_options_build(skb, &opt->opt, daddr, rt, 0);
171
171
skb->priority = sk->sk_priority;
312
312
!(IPCB(skb)->flags & IPSKB_REROUTED));
315
int ip_queue_xmit(struct sk_buff *skb)
315
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
317
317
struct sock *sk = skb->sk;
318
318
struct inet_sock *inet = inet_sk(sk);
319
struct ip_options *opt = inet->opt;
319
struct ip_options_rcu *inet_opt;
320
321
struct rtable *rt;
321
322
struct iphdr *iph;
337
340
/* Use correct destination address if we have options. */
338
341
daddr = inet->inet_daddr;
343
struct flowi fl = { .oif = sk->sk_bound_dev_if,
346
.fl4_src = inet->inet_saddr,
347
.fl4_tos = RT_CONN_FLAGS(sk),
348
.proto = sk->sk_protocol,
349
.flags = inet_sk_flowi_flags(sk),
350
.fl_ip_sport = inet->inet_sport,
351
.fl_ip_dport = inet->inet_dport };
353
/* If this fails, retransmit mechanism of transport layer will
354
* keep trying until route appears or the connection times
357
security_sk_classify_flow(sk, &fl);
358
if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
342
if (inet_opt && inet_opt->opt.srr)
343
daddr = inet_opt->opt.faddr;
345
/* If this fails, retransmit mechanism of transport layer will
346
* keep trying until route appears or the connection times
349
rt = ip_route_output_ports(sock_net(sk), fl4, sk,
350
daddr, inet->inet_saddr,
355
sk->sk_bound_dev_if);
361
358
sk_setup_caps(sk, &rt->dst);
363
360
skb_dst_set_noref(skb, &rt->dst);
366
if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
363
if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
369
366
/* OK, we know where to send it, allocate and build IP header. */
370
skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
367
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
371
368
skb_reset_network_header(skb);
372
369
iph = ip_hdr(skb);
373
370
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
377
374
iph->frag_off = 0;
378
375
iph->ttl = ip_select_ttl(inet, &rt->dst);
379
376
iph->protocol = sk->sk_protocol;
380
iph->saddr = rt->rt_src;
381
iph->daddr = rt->rt_dst;
377
iph->saddr = fl4->saddr;
378
iph->daddr = fl4->daddr;
382
379
/* Transport layer set skb->h.foo itself. */
384
if (opt && opt->optlen) {
385
iph->ihl += opt->optlen >> 2;
386
ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
381
if (inet_opt && inet_opt->opt.optlen) {
382
iph->ihl += inet_opt->opt.optlen >> 2;
383
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
389
386
ip_select_ident_more(iph, &rt->dst, sk,
735
732
static inline int ip_ufo_append_data(struct sock *sk,
733
struct sk_buff_head *queue,
736
734
int getfrag(void *from, char *to, int offset, int len,
737
735
int odd, struct sk_buff *skb),
738
736
void *from, int length, int hh_len, int fragheaderlen,
745
743
* device, so create one single skb packet containing complete
748
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
746
if ((skb = skb_peek_tail(queue)) == NULL) {
749
747
skb = sock_alloc_send_skb(sk,
750
748
hh_len + fragheaderlen + transhdrlen + 20,
751
749
(flags & MSG_DONTWAIT), &err);
768
766
skb->ip_summed = CHECKSUM_PARTIAL;
770
sk->sk_sndmsg_off = 0;
772
769
/* specify the length of each IP datagram fragment */
773
770
skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
774
771
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
775
__skb_queue_tail(&sk->sk_write_queue, skb);
772
__skb_queue_tail(queue, skb);
778
775
return skb_append_datato_frags(sk, skb, getfrag, from,
779
776
(length - transhdrlen));
783
* ip_append_data() and ip_append_page() can make one large IP datagram
784
* from many pieces of data. Each piece will be held on the socket
785
* until ip_push_pending_frames() is called. Each piece can be a page
788
* Not only UDP, other transport protocols - e.g. raw sockets - can use
789
* this interface potentially.
791
* LATER: length must be adjusted by pad at tail, when it is required.
793
int ip_append_data(struct sock *sk,
794
int getfrag(void *from, char *to, int offset, int len,
795
int odd, struct sk_buff *skb),
796
void *from, int length, int transhdrlen,
797
struct ipcm_cookie *ipc, struct rtable **rtp,
779
static int __ip_append_data(struct sock *sk,
781
struct sk_buff_head *queue,
782
struct inet_cork *cork,
783
int getfrag(void *from, char *to, int offset,
784
int len, int odd, struct sk_buff *skb),
785
void *from, int length, int transhdrlen,
800
788
struct inet_sock *inet = inet_sk(sk);
801
789
struct sk_buff *skb;
803
struct ip_options *opt = NULL;
791
struct ip_options *opt = cork->opt;
810
798
unsigned int maxfraglen, fragheaderlen;
811
799
int csummode = CHECKSUM_NONE;
817
if (skb_queue_empty(&sk->sk_write_queue)) {
823
if (inet->cork.opt == NULL) {
824
inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
825
if (unlikely(inet->cork.opt == NULL))
828
memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
829
inet->cork.flags |= IPCORK_OPT;
830
inet->cork.addr = ipc->addr;
836
* We steal reference to this route, caller should not release it
839
inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
841
dst_mtu(rt->dst.path);
842
inet->cork.dst = &rt->dst;
843
inet->cork.length = 0;
844
sk->sk_sndmsg_page = NULL;
845
sk->sk_sndmsg_off = 0;
846
exthdrlen = rt->dst.header_len;
848
transhdrlen += exthdrlen;
850
rt = (struct rtable *)inet->cork.dst;
851
if (inet->cork.flags & IPCORK_OPT)
852
opt = inet->cork.opt;
856
mtu = inet->cork.fragsize;
800
struct rtable *rt = (struct rtable *)cork->dst;
802
skb = skb_peek_tail(queue);
804
exthdrlen = !skb ? rt->dst.header_len : 0;
805
mtu = cork->fragsize;
858
807
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
860
809
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
861
810
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
863
if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
864
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
812
if (cork->length + length > 0xFFFF - fragheaderlen) {
813
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
866
815
return -EMSGSIZE;
877
826
csummode = CHECKSUM_PARTIAL;
879
skb = skb_peek_tail(&sk->sk_write_queue);
881
inet->cork.length += length;
828
cork->length += length;
882
829
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
883
830
(sk->sk_protocol == IPPROTO_UDP) &&
884
(rt->dst.dev->features & NETIF_F_UFO)) {
885
err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
886
fragheaderlen, transhdrlen, mtu,
831
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
832
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
833
hh_len, fragheaderlen, transhdrlen,
935
882
alloclen = fraglen;
884
alloclen += exthdrlen;
937
886
/* The last fragment gets additional space at tail.
938
887
* Note, with MSG_MORE we overallocate on fragments,
939
888
* because we have no idea what fragment will be
942
if (datalen == length + fraggap) {
891
if (datalen == length + fraggap)
943
892
alloclen += rt->dst.trailer_len;
944
/* make sure mtu is not reached */
945
if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
946
datalen -= ALIGN(rt->dst.trailer_len, 8);
948
894
if (transhdrlen) {
949
895
skb = sock_alloc_send_skb(sk,
950
896
alloclen + hh_len + 15,
972
918
skb->ip_summed = csummode;
974
920
skb_reserve(skb, hh_len);
975
skb_shinfo(skb)->tx_flags = ipc->tx_flags;
921
skb_shinfo(skb)->tx_flags = cork->tx_flags;
978
924
* Find where to start putting bytes.
980
data = skb_put(skb, fraglen);
926
data = skb_put(skb, fraglen + exthdrlen);
981
927
skb_set_network_header(skb, exthdrlen);
982
928
skb->transport_header = (skb->network_header +
984
data += fragheaderlen;
930
data += fragheaderlen + exthdrlen;
987
933
skb->csum = skb_copy_and_csum_bits(
1083
inet->cork.length -= length;
1029
cork->length -= length;
1084
1030
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1088
ssize_t ip_append_page(struct sock *sk, struct page *page,
1034
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1035
struct ipcm_cookie *ipc, struct rtable **rtp)
1037
struct inet_sock *inet = inet_sk(sk);
1038
struct ip_options_rcu *opt;
1042
* setup for corking.
1046
if (cork->opt == NULL) {
1047
cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1049
if (unlikely(cork->opt == NULL))
1052
memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1053
cork->flags |= IPCORK_OPT;
1054
cork->addr = ipc->addr;
1060
* We steal reference to this route, caller should not release it
1063
cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1064
rt->dst.dev->mtu : dst_mtu(&rt->dst);
1065
cork->dst = &rt->dst;
1067
cork->tx_flags = ipc->tx_flags;
1075
* ip_append_data() and ip_append_page() can make one large IP datagram
1076
* from many pieces of data. Each piece will be held on the socket
1077
* until ip_push_pending_frames() is called. Each piece can be a page
1080
* Not only UDP, other transport protocols - e.g. raw sockets - can use
1081
* this interface potentially.
1083
* LATER: length must be adjusted by pad at tail, when it is required.
1085
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1086
int getfrag(void *from, char *to, int offset, int len,
1087
int odd, struct sk_buff *skb),
1088
void *from, int length, int transhdrlen,
1089
struct ipcm_cookie *ipc, struct rtable **rtp,
1092
struct inet_sock *inet = inet_sk(sk);
1095
if (flags&MSG_PROBE)
1098
if (skb_queue_empty(&sk->sk_write_queue)) {
1099
err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1106
return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1107
from, length, transhdrlen, flags);
1110
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1089
1111
int offset, size_t size, int flags)
1091
1113
struct inet_sock *inet = inet_sk(sk);
1092
1114
struct sk_buff *skb;
1093
1115
struct rtable *rt;
1094
1116
struct ip_options *opt = NULL;
1117
struct inet_cork *cork;
1107
1130
if (skb_queue_empty(&sk->sk_write_queue))
1108
1131
return -EINVAL;
1110
rt = (struct rtable *)inet->cork.dst;
1111
if (inet->cork.flags & IPCORK_OPT)
1112
opt = inet->cork.opt;
1133
cork = &inet->cork.base;
1134
rt = (struct rtable *)cork->dst;
1135
if (cork->flags & IPCORK_OPT)
1114
1138
if (!(rt->dst.dev->features&NETIF_F_SG))
1115
1139
return -EOPNOTSUPP;
1117
1141
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1118
mtu = inet->cork.fragsize;
1142
mtu = cork->fragsize;
1120
1144
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1121
1145
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1123
if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1124
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1147
if (cork->length + size > 0xFFFF - fragheaderlen) {
1148
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1125
1149
return -EMSGSIZE;
1128
1152
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1129
1153
return -EINVAL;
1131
inet->cork.length += size;
1155
cork->length += size;
1132
1156
if ((size + skb->len > mtu) &&
1133
1157
(sk->sk_protocol == IPPROTO_UDP) &&
1134
1158
(rt->dst.dev->features & NETIF_F_UFO)) {
1226
inet->cork.length -= size;
1250
cork->length -= size;
1227
1251
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1231
static void ip_cork_release(struct inet_sock *inet)
1255
static void ip_cork_release(struct inet_cork *cork)
1233
inet->cork.flags &= ~IPCORK_OPT;
1234
kfree(inet->cork.opt);
1235
inet->cork.opt = NULL;
1236
dst_release(inet->cork.dst);
1237
inet->cork.dst = NULL;
1257
cork->flags &= ~IPCORK_OPT;
1260
dst_release(cork->dst);
1241
1265
* Combine all pending IP fragments on the socket into one IP datagram
1242
1266
* and push them out.
1244
int ip_push_pending_frames(struct sock *sk)
1268
struct sk_buff *__ip_make_skb(struct sock *sk,
1270
struct sk_buff_head *queue,
1271
struct inet_cork *cork)
1246
1273
struct sk_buff *skb, *tmp_skb;
1247
1274
struct sk_buff **tail_skb;
1248
1275
struct inet_sock *inet = inet_sk(sk);
1249
1276
struct net *net = sock_net(sk);
1250
1277
struct ip_options *opt = NULL;
1251
struct rtable *rt = (struct rtable *)inet->cork.dst;
1278
struct rtable *rt = (struct rtable *)cork->dst;
1252
1279
struct iphdr *iph;
1257
if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1283
if ((skb = __skb_dequeue(queue)) == NULL)
1259
1285
tail_skb = &(skb_shinfo(skb)->frag_list);
1261
1287
/* move skb->data to ip header from ext header */
1262
1288
if (skb->data < skb_network_header(skb))
1263
1289
__skb_pull(skb, skb_network_offset(skb));
1264
while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1290
while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1265
1291
__skb_pull(tmp_skb, skb_network_header_len(skb));
1266
1292
*tail_skb = tmp_skb;
1267
1293
tail_skb = &(tmp_skb->next);
1298
1324
iph = (struct iphdr *)skb->data;
1299
1325
iph->version = 4;
1302
iph->ihl += opt->optlen>>2;
1303
ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1305
1327
iph->tos = inet->tos;
1306
1328
iph->frag_off = df;
1307
1329
ip_select_ident(iph, &rt->dst, sk);
1308
1330
iph->ttl = ttl;
1309
1331
iph->protocol = sk->sk_protocol;
1310
iph->saddr = rt->rt_src;
1311
iph->daddr = rt->rt_dst;
1332
iph->saddr = fl4->saddr;
1333
iph->daddr = fl4->daddr;
1336
iph->ihl += opt->optlen>>2;
1337
ip_options_build(skb, opt, cork->addr, rt, 0);
1313
1340
skb->priority = sk->sk_priority;
1314
1341
skb->mark = sk->sk_mark;
1316
1343
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1317
1344
* on dst refcount
1319
inet->cork.dst = NULL;
1320
1347
skb_dst_set(skb, &rt->dst);
1322
1349
if (iph->protocol == IPPROTO_ICMP)
1323
1350
icmp_out_count(net, ((struct icmphdr *)
1324
1351
skb_transport_header(skb))->type);
1326
/* Netfilter gets whole the not fragmented skb. */
1353
ip_cork_release(cork);
1358
int ip_send_skb(struct sk_buff *skb)
1360
struct net *net = sock_net(skb->sk);
1327
1363
err = ip_local_out(skb);
1330
1366
err = net_xmit_errno(err);
1368
IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1336
ip_cork_release(inet);
1340
IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1374
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1376
struct sk_buff *skb;
1378
skb = ip_finish_skb(sk, fl4);
1382
/* Netfilter gets whole the not fragmented skb. */
1383
return ip_send_skb(skb);
1345
1387
* Throw away all pending data on the socket.
1389
static void __ip_flush_pending_frames(struct sock *sk,
1390
struct sk_buff_head *queue,
1391
struct inet_cork *cork)
1393
struct sk_buff *skb;
1395
while ((skb = __skb_dequeue_tail(queue)) != NULL)
1398
ip_cork_release(cork);
1347
1401
void ip_flush_pending_frames(struct sock *sk)
1349
struct sk_buff *skb;
1351
while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1354
ip_cork_release(inet_sk(sk));
1403
__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1406
struct sk_buff *ip_make_skb(struct sock *sk,
1408
int getfrag(void *from, char *to, int offset,
1409
int len, int odd, struct sk_buff *skb),
1410
void *from, int length, int transhdrlen,
1411
struct ipcm_cookie *ipc, struct rtable **rtp,
1414
struct inet_cork cork;
1415
struct sk_buff_head queue;
1418
if (flags & MSG_PROBE)
1421
__skb_queue_head_init(&queue);
1426
err = ip_setup_cork(sk, &cork, ipc, rtp);
1428
return ERR_PTR(err);
1430
err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1431
from, length, transhdrlen, flags);
1433
__ip_flush_pending_frames(sk, &queue, &cork);
1434
return ERR_PTR(err);
1437
return __ip_make_skb(sk, fl4, &queue, &cork);
1359
1441
* Fetch data from kernel space and fill in checksum if needed.
1375
1457
* Should run single threaded per socket because it uses the sock
1376
1458
* structure to pass arguments.
1378
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1460
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1461
struct ip_reply_arg *arg, unsigned int len)
1381
1463
struct inet_sock *inet = inet_sk(sk);
1383
struct ip_options opt;
1464
struct ip_options_data replyopts;
1386
1465
struct ipcm_cookie ipc;
1388
1467
struct rtable *rt = skb_rtable(skb);
1390
if (ip_options_echo(&replyopts.opt, skb))
1469
if (ip_options_echo(&replyopts.opt.opt, skb))
1393
daddr = ipc.addr = rt->rt_src;
1394
1473
ipc.opt = NULL;
1395
1474
ipc.tx_flags = 0;
1397
if (replyopts.opt.optlen) {
1476
if (replyopts.opt.opt.optlen) {
1398
1477
ipc.opt = &replyopts.opt;
1401
daddr = replyopts.opt.faddr;
1479
if (replyopts.opt.opt.srr)
1480
daddr = replyopts.opt.opt.faddr;
1405
struct flowi fl = { .oif = arg->bound_dev_if,
1407
.fl4_src = rt->rt_spec_dst,
1408
.fl4_tos = RT_TOS(ip_hdr(skb)->tos),
1409
.fl_ip_sport = tcp_hdr(skb)->dest,
1410
.fl_ip_dport = tcp_hdr(skb)->source,
1411
.proto = sk->sk_protocol,
1412
.flags = ip_reply_arg_flowi_flags(arg) };
1413
security_skb_classify_flow(skb, &fl);
1414
if (ip_route_output_key(sock_net(sk), &rt, &fl))
1483
flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1484
RT_TOS(ip_hdr(skb)->tos),
1485
RT_SCOPE_UNIVERSE, sk->sk_protocol,
1486
ip_reply_arg_flowi_flags(arg),
1487
daddr, rt->rt_spec_dst,
1488
tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1489
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1490
rt = ip_route_output_key(sock_net(sk), &fl4);
1418
1494
/* And let IP do all the hard work.
1426
1502
sk->sk_priority = skb->priority;
1427
1503
sk->sk_protocol = ip_hdr(skb)->protocol;
1428
1504
sk->sk_bound_dev_if = arg->bound_dev_if;
1429
ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1505
ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1430
1506
&ipc, &rt, MSG_DONTWAIT);
1431
1507
if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1432
1508
if (arg->csumoffset >= 0)