2
* Copyright (c) 2009, 2010, 2011 Nicira Networks.
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
* you may not use this file except in compliance with the License.
6
* You may obtain a copy of the License at:
8
* http://www.apache.org/licenses/LICENSE-2.0
10
* Unless required by applicable law or agreed to in writing, software
11
* distributed under the License is distributed on an "AS IS" BASIS,
12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
* See the License for the specific language governing permissions and
14
* limitations under the License.
19
#include "netdev-linux.h"
24
#include <arpa/inet.h>
26
#include <linux/gen_stats.h>
27
#include <linux/if_ether.h>
28
#include <linux/if_tun.h>
30
#include <linux/types.h>
31
#include <linux/ethtool.h>
32
#include <linux/mii.h>
33
#include <linux/pkt_sched.h>
34
#include <linux/rtnetlink.h>
35
#include <linux/sockios.h>
36
#include <linux/version.h>
37
#include <sys/types.h>
38
#include <sys/ioctl.h>
39
#include <sys/socket.h>
40
#include <netpacket/packet.h>
42
#include <net/if_arp.h>
43
#include <net/if_packet.h>
44
#include <net/route.h>
45
#include <netinet/in.h>
52
#include "dpif-linux.h"
53
#include "dynamic-string.h"
54
#include "fatal-signal.h"
57
#include "netdev-provider.h"
58
#include "netdev-vport.h"
60
#include "netlink-notifier.h"
61
#include "netlink-socket.h"
63
#include "openflow/openflow.h"
65
#include "poll-loop.h"
66
#include "rtnetlink-link.h"
67
#include "socket-util.h"
73
VLOG_DEFINE_THIS_MODULE(netdev_linux);
75
COVERAGE_DEFINE(netdev_set_policing);
76
COVERAGE_DEFINE(netdev_arp_lookup);
77
COVERAGE_DEFINE(netdev_get_ifindex);
78
COVERAGE_DEFINE(netdev_get_hwaddr);
79
COVERAGE_DEFINE(netdev_set_hwaddr);
80
COVERAGE_DEFINE(netdev_ethtool);
82
/* These were introduced in Linux 2.6.14, so they might be missing if we have
84
#ifndef ADVERTISED_Pause
85
#define ADVERTISED_Pause (1 << 13)
87
#ifndef ADVERTISED_Asym_Pause
88
#define ADVERTISED_Asym_Pause (1 << 14)
91
/* These were introduced in Linux 2.6.24, so they might be missing if we
92
* have old headers. */
93
#ifndef ETHTOOL_GFLAGS
94
#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
96
#ifndef ETHTOOL_SFLAGS
97
#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
100
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
103
#define TC_RTAB_SIZE 1024
106
static struct nln_notifier *netdev_linux_cache_notifier = NULL;
107
static int cache_notifier_refcount;
110
VALID_IFINDEX = 1 << 0,
111
VALID_ETHERADDR = 1 << 1,
115
VALID_POLICING = 1 << 5,
116
VALID_HAVE_VPORT_STATS = 1 << 6
124
/* Traffic control. */
126
/* An instance of a traffic control class. Always associated with a particular
129
* Each TC implementation subclasses this with whatever additional data it
132
const struct tc_ops *ops;
133
struct hmap queues; /* Contains "struct tc_queue"s.
134
* Read by generic TC layer.
135
* Written only by TC implementation. */
138
/* One traffic control queue.
140
* Each TC implementation subclasses this with whatever additional data it
143
struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
144
unsigned int queue_id; /* OpenFlow queue ID. */
147
/* A particular kind of traffic control. Each implementation generally maps to
148
* one particular Linux qdisc class.
150
* The functions below return 0 if successful or a positive errno value on
151
* failure, except where otherwise noted. All of them must be provided, except
152
* where otherwise noted. */
154
/* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
155
* This is null for tc_ops_default and tc_ops_other, for which there are no
156
* appropriate values. */
157
const char *linux_name;
159
/* Name used in OVS database, e.g. "linux-htb". Must be nonnull. */
160
const char *ovs_name;
162
/* Number of supported OpenFlow queues, 0 for qdiscs that have no
163
* queues. The queues are numbered 0 through n_queues - 1. */
164
unsigned int n_queues;
166
/* Called to install this TC class on 'netdev'. The implementation should
167
* make the Netlink calls required to set up 'netdev' with the right qdisc
168
* and configure it according to 'details'. The implementation may assume
169
* that the current qdisc is the default; that is, there is no need for it
170
* to delete the current qdisc before installing itself.
172
* The contents of 'details' should be documented as valid for 'ovs_name'
173
* in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
174
* (which is built as ovs-vswitchd.conf.db(8)).
176
* This function must return 0 if and only if it sets 'netdev->tc' to an
177
* initialized 'struct tc'.
179
* (This function is null for tc_ops_other, which cannot be installed. For
180
* other TC classes it should always be nonnull.) */
181
int (*tc_install)(struct netdev *netdev, const struct shash *details);
183
/* Called when the netdev code determines (through a Netlink query) that
184
* this TC class's qdisc is installed on 'netdev', but we didn't install
185
* it ourselves and so don't know any of the details.
187
* 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
188
* 'netdev'. The TCA_KIND attribute of 'nlmsg' is 'linux_name'. The
189
* implementation should parse the other attributes of 'nlmsg' as
190
* necessary to determine its configuration. If necessary it should also
191
* use Netlink queries to determine the configuration of queues on
194
* This function must return 0 if and only if it sets 'netdev->tc' to an
195
* initialized 'struct tc'. */
196
int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
198
/* Destroys the data structures allocated by the implementation as part of
199
* 'tc'. (This includes destroying 'tc->queues' by calling
202
* The implementation should not need to perform any Netlink calls. If
203
* desirable, the caller is responsible for deconfiguring the kernel qdisc.
204
* (But it may not be desirable.)
206
* This function may be null if 'tc' is trivial. */
207
void (*tc_destroy)(struct tc *tc);
209
/* Retrieves details of 'netdev->tc' configuration into 'details'.
211
* The implementation should not need to perform any Netlink calls, because
212
* the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
213
* cached the configuration.
215
* The contents of 'details' should be documented as valid for 'ovs_name'
216
* in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
217
* (which is built as ovs-vswitchd.conf.db(8)).
219
* This function may be null if 'tc' is not configurable.
221
int (*qdisc_get)(const struct netdev *netdev, struct shash *details);
223
/* Reconfigures 'netdev->tc' according to 'details', performing any
224
* required Netlink calls to complete the reconfiguration.
226
* The contents of 'details' should be documented as valid for 'ovs_name'
227
* in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
228
* (which is built as ovs-vswitchd.conf.db(8)).
230
* This function may be null if 'tc' is not configurable.
232
int (*qdisc_set)(struct netdev *, const struct shash *details);
234
/* Retrieves details of 'queue' on 'netdev->tc' into 'details'. 'queue' is
235
* one of the 'struct tc_queue's within 'netdev->tc->queues'.
237
* The contents of 'details' should be documented as valid for 'ovs_name'
238
* in the "other_config" column in the "Queue" table in
239
* vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
241
* The implementation should not need to perform any Netlink calls, because
242
* the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
243
* cached the queue configuration.
245
* This function may be null if 'tc' does not have queues ('n_queues' is
247
int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
248
struct shash *details);
250
/* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
251
* 'details', performing any required Netlink calls to complete the
252
* reconfiguration. The caller ensures that 'queue_id' is less than
255
* The contents of 'details' should be documented as valid for 'ovs_name'
256
* in the "other_config" column in the "Queue" table in
257
* vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
259
* This function may be null if 'tc' does not have queues or its queues are
260
* not configurable. */
261
int (*class_set)(struct netdev *, unsigned int queue_id,
262
const struct shash *details);
264
/* Deletes 'queue' from 'netdev->tc'. 'queue' is one of the 'struct
265
* tc_queue's within 'netdev->tc->queues'.
267
* This function may be null if 'tc' does not have queues or its queues
268
* cannot be deleted. */
269
int (*class_delete)(struct netdev *, struct tc_queue *queue);
271
/* Obtains stats for 'queue' from 'netdev->tc'. 'queue' is one of the
272
* 'struct tc_queue's within 'netdev->tc->queues'.
274
* On success, initializes '*stats'.
276
* This function may be null if 'tc' does not have queues or if it cannot
277
* report queue statistics. */
278
int (*class_get_stats)(const struct netdev *netdev,
279
const struct tc_queue *queue,
280
struct netdev_queue_stats *stats);
282
/* Extracts queue stats from 'nlmsg', which is a response to a
283
* RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
285
* This function may be null if 'tc' does not have queues or if it cannot
286
* report queue statistics. */
287
int (*class_dump_stats)(const struct netdev *netdev,
288
const struct ofpbuf *nlmsg,
289
netdev_dump_queue_stats_cb *cb, void *aux);
293
tc_init(struct tc *tc, const struct tc_ops *ops)
296
hmap_init(&tc->queues);
300
tc_destroy(struct tc *tc)
302
hmap_destroy(&tc->queues);
305
static const struct tc_ops tc_ops_htb;
306
static const struct tc_ops tc_ops_hfsc;
307
static const struct tc_ops tc_ops_default;
308
static const struct tc_ops tc_ops_other;
310
static const struct tc_ops *tcs[] = {
311
&tc_ops_htb, /* Hierarchy token bucket (see tc-htb(8)). */
312
&tc_ops_hfsc, /* Hierarchical fair service curve. */
313
&tc_ops_default, /* Default qdisc (see tc-pfifo_fast(8)). */
314
&tc_ops_other, /* Some other qdisc. */
318
static unsigned int tc_make_handle(unsigned int major, unsigned int minor);
319
static unsigned int tc_get_major(unsigned int handle);
320
static unsigned int tc_get_minor(unsigned int handle);
322
static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
323
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
324
static unsigned int tc_buffer_per_jiffy(unsigned int rate);
326
static struct tcmsg *tc_make_request(const struct netdev *, int type,
327
unsigned int flags, struct ofpbuf *);
328
static int tc_transact(struct ofpbuf *request, struct ofpbuf **replyp);
330
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
331
struct nlattr **options);
332
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
333
struct nlattr **options,
334
struct netdev_queue_stats *);
335
static int tc_query_class(const struct netdev *,
336
unsigned int handle, unsigned int parent,
337
struct ofpbuf **replyp);
338
static int tc_delete_class(const struct netdev *, unsigned int handle);
340
static int tc_del_qdisc(struct netdev *netdev);
341
static int tc_query_qdisc(const struct netdev *netdev);
343
static int tc_calc_cell_log(unsigned int mtu);
344
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
345
static void tc_put_rtab(struct ofpbuf *, uint16_t type,
346
const struct tc_ratespec *rate);
347
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
349
struct netdev_dev_linux {
350
struct netdev_dev netdev_dev;
352
struct shash_node *shash_node;
353
unsigned int cache_valid;
354
unsigned int change_seq;
356
bool miimon; /* Link status of last poll. */
357
long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
358
struct timer miimon_timer;
360
/* The following are figured out "on demand" only. They are only valid
361
* when the corresponding VALID_* bit in 'cache_valid' is set. */
363
uint8_t etheraddr[ETH_ADDR_LEN];
364
struct in_addr address, netmask;
368
long long int carrier_resets;
369
uint32_t kbits_rate; /* Policing data. */
370
uint32_t kbits_burst;
371
bool have_vport_stats;
375
struct tap_state tap;
379
struct netdev_linux {
380
struct netdev netdev;
384
/* Sockets used for ioctl operations. */
385
static int af_inet_sock = -1; /* AF_INET, SOCK_DGRAM. */
387
/* A Netlink routing socket that is not subscribed to any multicast groups. */
388
static struct nl_sock *rtnl_sock;
390
/* This is set pretty low because we probably won't learn anything from the
391
* additional log messages. */
392
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
394
static int netdev_linux_init(void);
396
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
397
int cmd, const char *cmd_name);
398
static int netdev_linux_do_ioctl(const char *name, struct ifreq *, int cmd,
399
const char *cmd_name);
400
static int netdev_linux_get_ipv4(const struct netdev *, struct in_addr *,
401
int cmd, const char *cmd_name);
402
static int get_flags(const struct netdev *, int *flagsp);
403
static int set_flags(struct netdev *, int flags);
404
static int do_get_ifindex(const char *netdev_name);
405
static int get_ifindex(const struct netdev *, int *ifindexp);
406
static int do_set_addr(struct netdev *netdev,
407
int ioctl_nr, const char *ioctl_name,
408
struct in_addr addr);
409
static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN]);
410
static int set_etheraddr(const char *netdev_name, int hwaddr_family,
411
const uint8_t[ETH_ADDR_LEN]);
412
static int get_stats_via_netlink(int ifindex, struct netdev_stats *stats);
413
static int get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats);
414
static int get_carrier_via_sysfs(const char *name, bool *carrier);
415
static int af_packet_sock(void);
416
static void netdev_linux_miimon_run(void);
417
static void netdev_linux_miimon_wait(void);
420
is_netdev_linux_class(const struct netdev_class *netdev_class)
422
return netdev_class->init == netdev_linux_init;
425
static struct netdev_dev_linux *
426
netdev_dev_linux_cast(const struct netdev_dev *netdev_dev)
428
const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
429
assert(is_netdev_linux_class(netdev_class));
431
return CONTAINER_OF(netdev_dev, struct netdev_dev_linux, netdev_dev);
434
static struct netdev_linux *
435
netdev_linux_cast(const struct netdev *netdev)
437
struct netdev_dev *netdev_dev = netdev_get_dev(netdev);
438
const struct netdev_class *netdev_class = netdev_dev_get_class(netdev_dev);
439
assert(is_netdev_linux_class(netdev_class));
441
return CONTAINER_OF(netdev, struct netdev_linux, netdev);
445
netdev_linux_init(void)
447
static int status = -1;
449
/* Create AF_INET socket. */
450
af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
451
status = af_inet_sock >= 0 ? 0 : errno;
453
VLOG_ERR("failed to create inet socket: %s", strerror(status));
456
/* Create rtnetlink socket. */
458
status = nl_sock_create(NETLINK_ROUTE, &rtnl_sock);
460
VLOG_ERR_RL(&rl, "failed to create rtnetlink socket: %s",
469
netdev_linux_run(void)
471
rtnetlink_link_run();
472
netdev_linux_miimon_run();
476
netdev_linux_wait(void)
478
rtnetlink_link_wait();
479
netdev_linux_miimon_wait();
483
netdev_dev_linux_changed(struct netdev_dev_linux *dev)
486
if (!dev->change_seq) {
489
dev->cache_valid = 0;
493
netdev_linux_cache_cb(const struct rtnetlink_link_change *change,
494
void *aux OVS_UNUSED)
496
struct netdev_dev_linux *dev;
498
struct netdev_dev *base_dev = netdev_dev_from_name(change->ifname);
500
const struct netdev_class *netdev_class =
501
netdev_dev_get_class(base_dev);
503
if (is_netdev_linux_class(netdev_class)) {
504
dev = netdev_dev_linux_cast(base_dev);
506
if (dev->carrier != change->running) {
507
dev->carrier = change->running;
508
dev->carrier_resets++;
511
netdev_dev_linux_changed(dev);
515
struct shash device_shash;
516
struct shash_node *node;
518
shash_init(&device_shash);
519
netdev_dev_get_devices(&netdev_linux_class, &device_shash);
520
SHASH_FOR_EACH (node, &device_shash) {
525
get_carrier_via_sysfs(node->name, &carrier);
526
if (dev->carrier != carrier) {
527
dev->carrier = carrier;
528
dev->carrier_resets++;
531
netdev_dev_linux_changed(dev);
533
shash_destroy(&device_shash);
537
/* Creates system and internal devices. */
539
netdev_linux_create(const struct netdev_class *class, const char *name,
540
struct netdev_dev **netdev_devp)
542
struct netdev_dev_linux *netdev_dev;
544
if (!cache_notifier_refcount) {
545
assert(!netdev_linux_cache_notifier);
547
netdev_linux_cache_notifier =
548
rtnetlink_link_notifier_create(netdev_linux_cache_cb, NULL);
550
if (!netdev_linux_cache_notifier) {
554
cache_notifier_refcount++;
556
netdev_dev = xzalloc(sizeof *netdev_dev);
557
netdev_dev->change_seq = 1;
558
netdev_dev_init(&netdev_dev->netdev_dev, name, class);
559
get_carrier_via_sysfs(name, &netdev_dev->carrier);
561
*netdev_devp = &netdev_dev->netdev_dev;
565
/* For most types of netdevs we open the device for each call of
566
* netdev_open(). However, this is not the case with tap devices,
567
* since it is only possible to open the device once. In this
568
* situation we share a single file descriptor, and consequently
569
* buffers, across all readers. Therefore once data is read it will
570
* be unavailable to other reads for tap devices. */
572
netdev_linux_create_tap(const struct netdev_class *class OVS_UNUSED,
573
const char *name, struct netdev_dev **netdev_devp)
575
struct netdev_dev_linux *netdev_dev;
576
struct tap_state *state;
577
static const char tap_dev[] = "/dev/net/tun";
581
netdev_dev = xzalloc(sizeof *netdev_dev);
582
state = &netdev_dev->state.tap;
584
/* Open tap device. */
585
state->fd = open(tap_dev, O_RDWR);
588
VLOG_WARN("opening \"%s\" failed: %s", tap_dev, strerror(error));
592
/* Create tap device. */
593
ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
594
ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
595
if (ioctl(state->fd, TUNSETIFF, &ifr) == -1) {
596
VLOG_WARN("%s: creating tap device failed: %s", name,
602
/* Make non-blocking. */
603
error = set_nonblocking(state->fd);
608
netdev_dev_init(&netdev_dev->netdev_dev, name, &netdev_tap_class);
609
*netdev_devp = &netdev_dev->netdev_dev;
618
destroy_tap(struct netdev_dev_linux *netdev_dev)
620
struct tap_state *state = &netdev_dev->state.tap;
622
if (state->fd >= 0) {
627
/* Destroys the netdev device 'netdev_dev_'. */
629
netdev_linux_destroy(struct netdev_dev *netdev_dev_)
631
struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
632
const struct netdev_class *class = netdev_dev_get_class(netdev_dev_);
634
if (netdev_dev->tc && netdev_dev->tc->ops->tc_destroy) {
635
netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
638
if (class == &netdev_linux_class || class == &netdev_internal_class) {
639
cache_notifier_refcount--;
641
if (!cache_notifier_refcount) {
642
assert(netdev_linux_cache_notifier);
643
rtnetlink_link_notifier_destroy(netdev_linux_cache_notifier);
644
netdev_linux_cache_notifier = NULL;
646
} else if (class == &netdev_tap_class) {
647
destroy_tap(netdev_dev);
656
netdev_linux_open(struct netdev_dev *netdev_dev_, struct netdev **netdevp)
658
struct netdev_dev_linux *netdev_dev = netdev_dev_linux_cast(netdev_dev_);
659
struct netdev_linux *netdev;
660
enum netdev_flags flags;
663
/* Allocate network device. */
664
netdev = xzalloc(sizeof *netdev);
666
netdev_init(&netdev->netdev, netdev_dev_);
668
/* Verify that the device really exists, by attempting to read its flags.
669
* (The flags might be cached, in which case this won't actually do an
672
* Don't do this for "internal" netdevs, though, because those have to be
673
* created as netdev objects before they exist in the kernel, because
674
* creating them in the kernel happens by passing a netdev object to
675
* dpif_port_add(). */
676
if (netdev_dev_get_class(netdev_dev_) != &netdev_internal_class) {
677
error = netdev_get_flags(&netdev->netdev, &flags);
678
if (error == ENODEV) {
683
if (!strcmp(netdev_dev_get_type(netdev_dev_), "tap") &&
684
!netdev_dev->state.tap.opened) {
686
/* We assume that the first user of the tap device is the primary user
687
* and give them the tap FD. Subsequent users probably just expect
688
* this to be a system device so open it normally to avoid send/receive
689
* directions appearing to be reversed. */
690
netdev->fd = netdev_dev->state.tap.fd;
691
netdev_dev->state.tap.opened = true;
694
*netdevp = &netdev->netdev;
698
netdev_uninit(&netdev->netdev, true);
702
/* Closes and destroys 'netdev'. */
704
netdev_linux_close(struct netdev *netdev_)
706
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
708
if (netdev->fd > 0 && strcmp(netdev_get_type(netdev_), "tap")) {
715
netdev_linux_listen(struct netdev *netdev_)
717
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
718
struct sockaddr_ll sll;
723
if (netdev->fd >= 0) {
727
/* Create file descriptor. */
728
fd = socket(PF_PACKET, SOCK_RAW, 0);
731
VLOG_ERR("failed to create raw socket (%s)", strerror(error));
735
/* Set non-blocking mode. */
736
error = set_nonblocking(fd);
741
/* Get ethernet device index. */
742
error = get_ifindex(&netdev->netdev, &ifindex);
747
/* Bind to specific ethernet device. */
748
memset(&sll, 0, sizeof sll);
749
sll.sll_family = AF_PACKET;
750
sll.sll_ifindex = ifindex;
751
sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL);
752
if (bind(fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
754
VLOG_ERR("%s: failed to bind raw socket (%s)",
755
netdev_get_name(netdev_), strerror(error));
770
netdev_linux_recv(struct netdev *netdev_, void *data, size_t size)
772
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
774
if (netdev->fd < 0) {
775
/* Device is not listening. */
780
ssize_t retval = read(netdev->fd, data, size);
783
} else if (errno != EINTR) {
784
if (errno != EAGAIN) {
785
VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
786
/* Arguments follow the format string: device name first, then error text. */
netdev_get_name(netdev_), strerror(errno));
793
/* Registers with the poll loop to wake up from the next call to poll_block()
794
* when a packet is ready to be received with netdev_recv() on 'netdev'. */
796
netdev_linux_recv_wait(struct netdev *netdev_)
798
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
799
if (netdev->fd >= 0) {
800
poll_fd_wait(netdev->fd, POLLIN);
804
/* Discards all packets waiting to be received from 'netdev'. */
806
netdev_linux_drain(struct netdev *netdev_)
808
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
809
if (netdev->fd < 0) {
811
} else if (!strcmp(netdev_get_type(netdev_), "tap")) {
813
int error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
814
SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
818
drain_fd(netdev->fd, ifr.ifr_qlen);
821
return drain_rcvbuf(netdev->fd);
825
/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive
826
* errno value. Returns EAGAIN without blocking if the packet cannot be queued
827
* immediately. Returns EMSGSIZE if a partial packet was transmitted or if
828
* the packet is too big or too small to transmit on the device.
830
* The caller retains ownership of 'buffer' in all cases.
832
* The kernel maintains a packet transmission queue, so the caller is not
833
* expected to do additional queuing of packets. */
835
netdev_linux_send(struct netdev *netdev_, const void *data, size_t size)
837
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
841
if (netdev->fd < 0) {
842
/* Use our AF_PACKET socket to send to this device. */
843
struct sockaddr_ll sll;
850
sock = af_packet_sock();
855
error = get_ifindex(netdev_, &ifindex);
860
/* We don't bother setting most fields in sockaddr_ll because the
861
* kernel ignores them for SOCK_RAW. */
862
memset(&sll, 0, sizeof sll);
863
sll.sll_family = AF_PACKET;
864
sll.sll_ifindex = ifindex;
866
iov.iov_base = (void *) data;
870
msg.msg_namelen = sizeof sll;
873
msg.msg_control = NULL;
874
msg.msg_controllen = 0;
877
retval = sendmsg(sock, &msg, 0);
879
/* Use the netdev's own fd to send to this device. This is
880
* essential for tap devices, because packets sent to a tap device
881
* with an AF_PACKET socket will loop back to be *received* again
882
* on the tap device. */
883
retval = write(netdev->fd, data, size);
887
/* The Linux AF_PACKET implementation never blocks waiting for room
888
* for packets, instead returning ENOBUFS. Translate this into
889
* EAGAIN for the caller. */
890
if (errno == ENOBUFS) {
892
} else if (errno == EINTR) {
894
} else if (errno != EAGAIN) {
895
VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
896
netdev_get_name(netdev_), strerror(errno));
899
} else if (retval != size) {
900
VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%zd bytes of "
901
"%zu) on %s", retval, size, netdev_get_name(netdev_));
909
/* Registers with the poll loop to wake up from the next call to poll_block()
910
* when the packet transmission queue has sufficient room to transmit a packet
911
* with netdev_send().
913
* The kernel maintains a packet transmission queue, so the client is not
914
* expected to do additional queuing of packets. Thus, this function is
915
* unlikely to ever be used. It is included for completeness. */
917
netdev_linux_send_wait(struct netdev *netdev_)
919
struct netdev_linux *netdev = netdev_linux_cast(netdev_);
920
if (netdev->fd < 0) {
922
} else if (strcmp(netdev_get_type(netdev_), "tap")) {
923
poll_fd_wait(netdev->fd, POLLOUT);
925
/* TAP device always accepts packets. */
926
poll_immediate_wake();
930
/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
931
* otherwise a positive errno value. */
933
netdev_linux_set_etheraddr(struct netdev *netdev_,
934
const uint8_t mac[ETH_ADDR_LEN])
936
struct netdev_dev_linux *netdev_dev =
937
netdev_dev_linux_cast(netdev_get_dev(netdev_));
940
if (!(netdev_dev->cache_valid & VALID_ETHERADDR)
941
|| !eth_addr_equals(netdev_dev->etheraddr, mac)) {
942
error = set_etheraddr(netdev_get_name(netdev_), ARPHRD_ETHER, mac);
944
netdev_dev->cache_valid |= VALID_ETHERADDR;
945
memcpy(netdev_dev->etheraddr, mac, ETH_ADDR_LEN);
953
/* Copies 'netdev''s MAC address into 'mac', a buffer supplied by the caller,
954
* fetching and caching the address first if it is not already cached. */
956
netdev_linux_get_etheraddr(const struct netdev *netdev_,
957
uint8_t mac[ETH_ADDR_LEN])
959
struct netdev_dev_linux *netdev_dev =
960
netdev_dev_linux_cast(netdev_get_dev(netdev_));
961
if (!(netdev_dev->cache_valid & VALID_ETHERADDR)) {
962
int error = get_etheraddr(netdev_get_name(netdev_),
963
netdev_dev->etheraddr);
967
netdev_dev->cache_valid |= VALID_ETHERADDR;
969
memcpy(mac, netdev_dev->etheraddr, ETH_ADDR_LEN);
973
/* Returns the maximum size of transmitted (and received) packets on 'netdev',
974
* in bytes, not including the hardware header; thus, this is typically 1500
975
* bytes for Ethernet devices. */
977
netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
979
struct netdev_dev_linux *netdev_dev =
980
netdev_dev_linux_cast(netdev_get_dev(netdev_));
981
if (!(netdev_dev->cache_valid & VALID_MTU)) {
985
error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
986
SIOCGIFMTU, "SIOCGIFMTU");
990
netdev_dev->mtu = ifr.ifr_mtu;
991
netdev_dev->cache_valid |= VALID_MTU;
993
*mtup = netdev_dev->mtu;
997
/* Sets the maximum transmission unit (MTU) of the given device using the Linux
998
* networking ioctl interface.
1001
netdev_linux_set_mtu(const struct netdev *netdev_, int mtu)
1003
struct netdev_dev_linux *netdev_dev =
1004
netdev_dev_linux_cast(netdev_get_dev(netdev_));
1009
error = netdev_linux_do_ioctl(netdev_get_name(netdev_), &ifr,
1010
SIOCSIFMTU, "SIOCSIFMTU");
1015
netdev_dev->mtu = ifr.ifr_mtu;
1016
netdev_dev->cache_valid |= VALID_MTU;
1020
/* Returns the ifindex of 'netdev', if successful, as a positive number.
1021
* On failure, returns a negative errno value. */
1023
netdev_linux_get_ifindex(const struct netdev *netdev)
1027
error = get_ifindex(netdev, &ifindex);
1028
return error ? -error : ifindex;
1032
netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1034
struct netdev_dev_linux *netdev_dev =
1035
netdev_dev_linux_cast(netdev_get_dev(netdev_));
1037
if (netdev_dev->miimon_interval > 0) {
1038
*carrier = netdev_dev->miimon;
1040
*carrier = netdev_dev->carrier;
1046
static long long int
1047
netdev_linux_get_carrier_resets(const struct netdev *netdev)
1049
return netdev_dev_linux_cast(netdev_get_dev(netdev))->carrier_resets;
1053
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1054
struct mii_ioctl_data *data)
1059
memset(&ifr, 0, sizeof ifr);
1060
memcpy(&ifr.ifr_data, data, sizeof *data);
1061
error = netdev_linux_do_ioctl(name, &ifr, cmd, cmd_name);
1062
memcpy(data, &ifr.ifr_data, sizeof *data);
1068
netdev_linux_get_miimon(const char *name, bool *miimon)
1070
struct mii_ioctl_data data;
1075
memset(&data, 0, sizeof data);
1076
error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
1078
/* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
1079
data.reg_num = MII_BMSR;
1080
error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
1084
*miimon = !!(data.val_out & BMSR_LSTATUS);
1086
VLOG_WARN_RL(&rl, "%s: failed to query MII", name);
1089
struct ethtool_cmd ecmd;
1091
VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
1094
memset(&ecmd, 0, sizeof ecmd);
1095
error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
1098
struct ethtool_value eval;
1100
memcpy(&eval, &ecmd, sizeof eval);
1101
*miimon = !!eval.data;
1103
VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
1111
netdev_linux_set_miimon_interval(struct netdev *netdev_,
1112
long long int interval)
1114
struct netdev_dev_linux *netdev_dev;
1116
netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev_));
1118
interval = interval > 0 ? MAX(interval, 100) : 0;
1119
if (netdev_dev->miimon_interval != interval) {
1120
netdev_dev->miimon_interval = interval;
1121
timer_set_expired(&netdev_dev->miimon_timer);
1128
netdev_linux_miimon_run(void)
1130
struct shash device_shash;
1131
struct shash_node *node;
1133
shash_init(&device_shash);
1134
netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1135
SHASH_FOR_EACH (node, &device_shash) {
1136
struct netdev_dev_linux *dev = node->data;
1139
if (dev->miimon_interval <= 0 || !timer_expired(&dev->miimon_timer)) {
1143
netdev_linux_get_miimon(dev->netdev_dev.name, &miimon);
1144
if (miimon != dev->miimon) {
1145
dev->miimon = miimon;
1146
netdev_dev_linux_changed(dev);
1149
timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1152
shash_destroy(&device_shash);
1156
netdev_linux_miimon_wait(void)
1158
struct shash device_shash;
1159
struct shash_node *node;
1161
shash_init(&device_shash);
1162
netdev_dev_get_devices(&netdev_linux_class, &device_shash);
1163
SHASH_FOR_EACH (node, &device_shash) {
1164
struct netdev_dev_linux *dev = node->data;
1166
if (dev->miimon_interval > 0) {
1167
timer_wait(&dev->miimon_timer);
1170
shash_destroy(&device_shash);
1173
/* Check whether we can use RTM_GETLINK to get network device statistics.
1174
* In pre-2.6.19 kernels, this was only available if wireless extensions were
1177
check_for_working_netlink_stats(void)
1179
/* Decide on the netdev_get_stats() implementation to use. Netlink is
1180
* preferable, so if that works, we'll use it. */
1181
int ifindex = do_get_ifindex("lo");
1183
VLOG_WARN("failed to get ifindex for lo, "
1184
"obtaining netdev stats from proc");
1187
struct netdev_stats stats;
1188
int error = get_stats_via_netlink(ifindex, &stats);
1190
VLOG_DBG("obtaining netdev stats via rtnetlink");
1193
VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
1194
"via proc (you are probably running a pre-2.6.19 "
1195
"kernel)", strerror(error));
1202
swap_uint64(uint64_t *a, uint64_t *b)
1210
get_stats_via_vport(const struct netdev *netdev_,
1211
struct netdev_stats *stats)
1213
struct netdev_dev_linux *netdev_dev =
1214
netdev_dev_linux_cast(netdev_get_dev(netdev_));
1216
if (netdev_dev->have_vport_stats ||
1217
!(netdev_dev->cache_valid & VALID_HAVE_VPORT_STATS)) {
1220
error = netdev_vport_get_stats(netdev_, stats);
1222
VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed %d",
1223
netdev_get_name(netdev_), error);
1225
netdev_dev->have_vport_stats = !error;
1226
netdev_dev->cache_valid |= VALID_HAVE_VPORT_STATS;
1231
netdev_linux_sys_get_stats(const struct netdev *netdev_,
1232
struct netdev_stats *stats)
1234
static int use_netlink_stats = -1;
1237
if (use_netlink_stats < 0) {
1238
use_netlink_stats = check_for_working_netlink_stats();
1241
if (use_netlink_stats) {
1244
error = get_ifindex(netdev_, &ifindex);
1246
error = get_stats_via_netlink(ifindex, stats);
1249
error = get_stats_via_proc(netdev_get_name(netdev_), stats);
1253
VLOG_WARN_RL(&rl, "%s: linux-sys get stats failed %d",
1254
netdev_get_name(netdev_), error);
1260
/* Retrieves current device stats for 'netdev-linux'. */
1262
netdev_linux_get_stats(const struct netdev *netdev_,
1263
struct netdev_stats *stats)
1265
struct netdev_dev_linux *netdev_dev =
1266
netdev_dev_linux_cast(netdev_get_dev(netdev_));
1267
struct netdev_stats dev_stats;
1270
get_stats_via_vport(netdev_, stats);
1272
error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1275
if (!netdev_dev->have_vport_stats) {
1282
if (!netdev_dev->have_vport_stats) {
1283
/* stats not available from OVS then use ioctl stats. */
1286
stats->rx_errors += dev_stats.rx_errors;
1287
stats->tx_errors += dev_stats.tx_errors;
1288
stats->rx_dropped += dev_stats.rx_dropped;
1289
stats->tx_dropped += dev_stats.tx_dropped;
1290
stats->multicast += dev_stats.multicast;
1291
stats->collisions += dev_stats.collisions;
1292
stats->rx_length_errors += dev_stats.rx_length_errors;
1293
stats->rx_over_errors += dev_stats.rx_over_errors;
1294
stats->rx_crc_errors += dev_stats.rx_crc_errors;
1295
stats->rx_frame_errors += dev_stats.rx_frame_errors;
1296
stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
1297
stats->rx_missed_errors += dev_stats.rx_missed_errors;
1298
stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
1299
stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
1300
stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
1301
stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
1302
stats->tx_window_errors += dev_stats.tx_window_errors;
1307
/* Retrieves current device stats for 'netdev-tap' netdev or
1308
* netdev-internal. */
1310
netdev_pseudo_get_stats(const struct netdev *netdev_,
1311
struct netdev_stats *stats)
1313
struct netdev_dev_linux *netdev_dev =
1314
netdev_dev_linux_cast(netdev_get_dev(netdev_));
1315
struct netdev_stats dev_stats;
1318
get_stats_via_vport(netdev_, stats);
1320
error = netdev_linux_sys_get_stats(netdev_, &dev_stats);
1322
if (!netdev_dev->have_vport_stats) {
1329
/* If this port is an internal port then the transmit and receive stats
1330
* will appear to be swapped relative to the other ports since we are the
1331
* one sending the data, not a remote computer. For consistency, we swap
1332
* them back here. This does not apply if we are getting stats from the
1333
* vport layer because it always tracks stats from the perspective of the
1335
if (!netdev_dev->have_vport_stats) {
1337
swap_uint64(&stats->rx_packets, &stats->tx_packets);
1338
swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
1339
swap_uint64(&stats->rx_errors, &stats->tx_errors);
1340
swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
1341
stats->rx_length_errors = 0;
1342
stats->rx_over_errors = 0;
1343
stats->rx_crc_errors = 0;
1344
stats->rx_frame_errors = 0;
1345
stats->rx_fifo_errors = 0;
1346
stats->rx_missed_errors = 0;
1347
stats->tx_aborted_errors = 0;
1348
stats->tx_carrier_errors = 0;
1349
stats->tx_fifo_errors = 0;
1350
stats->tx_heartbeat_errors = 0;
1351
stats->tx_window_errors = 0;
1353
stats->rx_dropped += dev_stats.tx_dropped;
1354
stats->tx_dropped += dev_stats.rx_dropped;
1356
stats->rx_errors += dev_stats.tx_errors;
1357
stats->tx_errors += dev_stats.rx_errors;
1359
stats->multicast += dev_stats.multicast;
1360
stats->collisions += dev_stats.collisions;
1365
/* Stores the features supported by 'netdev' into each of '*current',
1366
* '*advertised', '*supported', and '*peer' that are non-null. Each value is a
1367
* bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if
1368
* successful, otherwise a positive errno value. */
1370
netdev_linux_get_features(const struct netdev *netdev,
1371
uint32_t *current, uint32_t *advertised,
1372
uint32_t *supported, uint32_t *peer)
1374
struct ethtool_cmd ecmd;
1377
memset(&ecmd, 0, sizeof ecmd);
1378
error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1379
ETHTOOL_GSET, "ETHTOOL_GSET");
1384
/* Supported features. */
1386
if (ecmd.supported & SUPPORTED_10baseT_Half) {
1387
*supported |= OFPPF_10MB_HD;
1389
if (ecmd.supported & SUPPORTED_10baseT_Full) {
1390
*supported |= OFPPF_10MB_FD;
1392
if (ecmd.supported & SUPPORTED_100baseT_Half) {
1393
*supported |= OFPPF_100MB_HD;
1395
if (ecmd.supported & SUPPORTED_100baseT_Full) {
1396
*supported |= OFPPF_100MB_FD;
1398
if (ecmd.supported & SUPPORTED_1000baseT_Half) {
1399
*supported |= OFPPF_1GB_HD;
1401
if (ecmd.supported & SUPPORTED_1000baseT_Full) {
1402
*supported |= OFPPF_1GB_FD;
1404
if (ecmd.supported & SUPPORTED_10000baseT_Full) {
1405
*supported |= OFPPF_10GB_FD;
1407
if (ecmd.supported & SUPPORTED_TP) {
1408
*supported |= OFPPF_COPPER;
1410
if (ecmd.supported & SUPPORTED_FIBRE) {
1411
*supported |= OFPPF_FIBER;
1413
if (ecmd.supported & SUPPORTED_Autoneg) {
1414
*supported |= OFPPF_AUTONEG;
1416
if (ecmd.supported & SUPPORTED_Pause) {
1417
*supported |= OFPPF_PAUSE;
1419
if (ecmd.supported & SUPPORTED_Asym_Pause) {
1420
*supported |= OFPPF_PAUSE_ASYM;
1423
/* Advertised features. */
1425
if (ecmd.advertising & ADVERTISED_10baseT_Half) {
1426
*advertised |= OFPPF_10MB_HD;
1428
if (ecmd.advertising & ADVERTISED_10baseT_Full) {
1429
*advertised |= OFPPF_10MB_FD;
1431
if (ecmd.advertising & ADVERTISED_100baseT_Half) {
1432
*advertised |= OFPPF_100MB_HD;
1434
if (ecmd.advertising & ADVERTISED_100baseT_Full) {
1435
*advertised |= OFPPF_100MB_FD;
1437
if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
1438
*advertised |= OFPPF_1GB_HD;
1440
if (ecmd.advertising & ADVERTISED_1000baseT_Full) {
1441
*advertised |= OFPPF_1GB_FD;
1443
if (ecmd.advertising & ADVERTISED_10000baseT_Full) {
1444
*advertised |= OFPPF_10GB_FD;
1446
if (ecmd.advertising & ADVERTISED_TP) {
1447
*advertised |= OFPPF_COPPER;
1449
if (ecmd.advertising & ADVERTISED_FIBRE) {
1450
*advertised |= OFPPF_FIBER;
1452
if (ecmd.advertising & ADVERTISED_Autoneg) {
1453
*advertised |= OFPPF_AUTONEG;
1455
if (ecmd.advertising & ADVERTISED_Pause) {
1456
*advertised |= OFPPF_PAUSE;
1458
if (ecmd.advertising & ADVERTISED_Asym_Pause) {
1459
*advertised |= OFPPF_PAUSE_ASYM;
1462
/* Current settings. */
1463
if (ecmd.speed == SPEED_10) {
1464
*current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
1465
} else if (ecmd.speed == SPEED_100) {
1466
*current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
1467
} else if (ecmd.speed == SPEED_1000) {
1468
*current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
1469
} else if (ecmd.speed == SPEED_10000) {
1470
*current = OFPPF_10GB_FD;
1475
if (ecmd.port == PORT_TP) {
1476
*current |= OFPPF_COPPER;
1477
} else if (ecmd.port == PORT_FIBRE) {
1478
*current |= OFPPF_FIBER;
1482
*current |= OFPPF_AUTONEG;
1485
/* Peer advertisements. */
1486
*peer = 0; /* XXX */
1491
/* Set the features advertised by 'netdev' to 'advertise'. */
1493
netdev_linux_set_advertisements(struct netdev *netdev, uint32_t advertise)
1495
struct ethtool_cmd ecmd;
1498
memset(&ecmd, 0, sizeof ecmd);
1499
error = netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1500
ETHTOOL_GSET, "ETHTOOL_GSET");
1505
ecmd.advertising = 0;
1506
if (advertise & OFPPF_10MB_HD) {
1507
ecmd.advertising |= ADVERTISED_10baseT_Half;
1509
if (advertise & OFPPF_10MB_FD) {
1510
ecmd.advertising |= ADVERTISED_10baseT_Full;
1512
if (advertise & OFPPF_100MB_HD) {
1513
ecmd.advertising |= ADVERTISED_100baseT_Half;
1515
if (advertise & OFPPF_100MB_FD) {
1516
ecmd.advertising |= ADVERTISED_100baseT_Full;
1518
if (advertise & OFPPF_1GB_HD) {
1519
ecmd.advertising |= ADVERTISED_1000baseT_Half;
1521
if (advertise & OFPPF_1GB_FD) {
1522
ecmd.advertising |= ADVERTISED_1000baseT_Full;
1524
if (advertise & OFPPF_10GB_FD) {
1525
ecmd.advertising |= ADVERTISED_10000baseT_Full;
1527
if (advertise & OFPPF_COPPER) {
1528
ecmd.advertising |= ADVERTISED_TP;
1530
if (advertise & OFPPF_FIBER) {
1531
ecmd.advertising |= ADVERTISED_FIBRE;
1533
if (advertise & OFPPF_AUTONEG) {
1534
ecmd.advertising |= ADVERTISED_Autoneg;
1536
if (advertise & OFPPF_PAUSE) {
1537
ecmd.advertising |= ADVERTISED_Pause;
1539
if (advertise & OFPPF_PAUSE_ASYM) {
1540
ecmd.advertising |= ADVERTISED_Asym_Pause;
1542
return netdev_linux_do_ethtool(netdev_get_name(netdev), &ecmd,
1543
ETHTOOL_SSET, "ETHTOOL_SSET");
1546
#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
1547
#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1"
1549
/* Remove ingress policing from 'netdev'. Returns 0 if successful, otherwise a
1550
* positive errno value.
1552
* This function is equivalent to running
1553
* /sbin/tc qdisc del dev %s handle ffff: ingress
1554
* but it is much, much faster.
1557
netdev_linux_remove_policing(struct netdev *netdev)
1559
struct netdev_dev_linux *netdev_dev =
1560
netdev_dev_linux_cast(netdev_get_dev(netdev));
1561
const char *netdev_name = netdev_get_name(netdev);
1563
struct ofpbuf request;
1564
struct tcmsg *tcmsg;
1567
tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
1571
tcmsg->tcm_handle = tc_make_handle(0xffff, 0);
1572
tcmsg->tcm_parent = TC_H_INGRESS;
1573
nl_msg_put_string(&request, TCA_KIND, "ingress");
1574
nl_msg_put_unspec(&request, TCA_OPTIONS, NULL, 0);
1576
error = tc_transact(&request, NULL);
1577
if (error && error != ENOENT && error != EINVAL) {
1578
VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
1579
netdev_name, strerror(error));
1583
netdev_dev->kbits_rate = 0;
1584
netdev_dev->kbits_burst = 0;
1585
netdev_dev->cache_valid |= VALID_POLICING;
1589
/* Attempts to set input rate limiting (policing) policy. */
1591
netdev_linux_set_policing(struct netdev *netdev,
1592
uint32_t kbits_rate, uint32_t kbits_burst)
1594
struct netdev_dev_linux *netdev_dev =
1595
netdev_dev_linux_cast(netdev_get_dev(netdev));
1596
const char *netdev_name = netdev_get_name(netdev);
1599
COVERAGE_INC(netdev_set_policing);
1601
kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
1602
: !kbits_burst ? 1000 /* Default to 1000 kbits if 0. */
1603
: kbits_burst); /* Stick with user-specified value. */
1605
if (netdev_dev->cache_valid & VALID_POLICING
1606
&& netdev_dev->kbits_rate == kbits_rate
1607
&& netdev_dev->kbits_burst == kbits_burst) {
1608
/* Assume that settings haven't changed since we last set them. */
1612
netdev_linux_remove_policing(netdev);
1614
snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
1615
if (system(command) != 0) {
1616
VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
1620
snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
1621
kbits_rate, kbits_burst);
1622
if (system(command) != 0) {
1623
VLOG_WARN_RL(&rl, "%s: problem configuring policing",
1628
netdev_dev->kbits_rate = kbits_rate;
1629
netdev_dev->kbits_burst = kbits_burst;
1630
netdev_dev->cache_valid |= VALID_POLICING;
1637
/* Adds to 'types' the OVS name of every traffic-control implementation in
 * 'tcs' that can be installed (has a 'tc_install' callback) and has a
 * nonempty 'ovs_name'.  'netdev' is unused.  Always returns 0. */
static int
netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
                           struct sset *types)
{
    const struct tc_ops **opsp;

    for (opsp = tcs; *opsp != NULL; opsp++) {
        const struct tc_ops *ops = *opsp;
        if (ops->tc_install && ops->ovs_name[0] != '\0') {
            sset_add(types, ops->ovs_name);
        }
    }
    return 0;
}
1651
static const struct tc_ops *
1652
tc_lookup_ovs_name(const char *name)
1654
const struct tc_ops **opsp;
1656
for (opsp = tcs; *opsp != NULL; opsp++) {
1657
const struct tc_ops *ops = *opsp;
1658
if (!strcmp(name, ops->ovs_name)) {
1665
static const struct tc_ops *
1666
tc_lookup_linux_name(const char *name)
1668
const struct tc_ops **opsp;
1670
for (opsp = tcs; *opsp != NULL; opsp++) {
1671
const struct tc_ops *ops = *opsp;
1672
if (ops->linux_name && !strcmp(name, ops->linux_name)) {
1679
static struct tc_queue *
1680
tc_find_queue__(const struct netdev *netdev, unsigned int queue_id,
1683
struct netdev_dev_linux *netdev_dev =
1684
netdev_dev_linux_cast(netdev_get_dev(netdev));
1685
struct tc_queue *queue;
1687
HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev_dev->tc->queues) {
1688
if (queue->queue_id == queue_id) {
1695
/* Looks up 'queue_id' via tc_find_queue__(), using hash_int(queue_id, 0) as
 * the bucket hash, and returns whatever that lookup returns. */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
1702
netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
1704
struct netdev_qos_capabilities *caps)
1706
const struct tc_ops *ops = tc_lookup_ovs_name(type);
1710
caps->n_queues = ops->n_queues;
1715
netdev_linux_get_qos(const struct netdev *netdev,
1716
const char **typep, struct shash *details)
1718
struct netdev_dev_linux *netdev_dev =
1719
netdev_dev_linux_cast(netdev_get_dev(netdev));
1722
error = tc_query_qdisc(netdev);
1727
*typep = netdev_dev->tc->ops->ovs_name;
1728
return (netdev_dev->tc->ops->qdisc_get
1729
? netdev_dev->tc->ops->qdisc_get(netdev, details)
1734
netdev_linux_set_qos(struct netdev *netdev,
1735
const char *type, const struct shash *details)
1737
struct netdev_dev_linux *netdev_dev =
1738
netdev_dev_linux_cast(netdev_get_dev(netdev));
1739
const struct tc_ops *new_ops;
1742
new_ops = tc_lookup_ovs_name(type);
1743
if (!new_ops || !new_ops->tc_install) {
1747
error = tc_query_qdisc(netdev);
1752
if (new_ops == netdev_dev->tc->ops) {
1753
return new_ops->qdisc_set ? new_ops->qdisc_set(netdev, details) : 0;
1755
/* Delete existing qdisc. */
1756
error = tc_del_qdisc(netdev);
1760
assert(netdev_dev->tc == NULL);
1762
/* Install new qdisc. */
1763
error = new_ops->tc_install(netdev, details);
1764
assert((error == 0) == (netdev_dev->tc != NULL));
1771
netdev_linux_get_queue(const struct netdev *netdev,
1772
unsigned int queue_id, struct shash *details)
1774
struct netdev_dev_linux *netdev_dev =
1775
netdev_dev_linux_cast(netdev_get_dev(netdev));
1778
error = tc_query_qdisc(netdev);
1782
struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1784
? netdev_dev->tc->ops->class_get(netdev, queue, details)
1790
netdev_linux_set_queue(struct netdev *netdev,
1791
unsigned int queue_id, const struct shash *details)
1793
struct netdev_dev_linux *netdev_dev =
1794
netdev_dev_linux_cast(netdev_get_dev(netdev));
1797
error = tc_query_qdisc(netdev);
1800
} else if (queue_id >= netdev_dev->tc->ops->n_queues
1801
|| !netdev_dev->tc->ops->class_set) {
1805
return netdev_dev->tc->ops->class_set(netdev, queue_id, details);
1809
netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
1811
struct netdev_dev_linux *netdev_dev =
1812
netdev_dev_linux_cast(netdev_get_dev(netdev));
1815
error = tc_query_qdisc(netdev);
1818
} else if (!netdev_dev->tc->ops->class_delete) {
1821
struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1823
? netdev_dev->tc->ops->class_delete(netdev, queue)
1829
netdev_linux_get_queue_stats(const struct netdev *netdev,
1830
unsigned int queue_id,
1831
struct netdev_queue_stats *stats)
1833
struct netdev_dev_linux *netdev_dev =
1834
netdev_dev_linux_cast(netdev_get_dev(netdev));
1837
error = tc_query_qdisc(netdev);
1840
} else if (!netdev_dev->tc->ops->class_get_stats) {
1843
const struct tc_queue *queue = tc_find_queue(netdev, queue_id);
1845
? netdev_dev->tc->ops->class_get_stats(netdev, queue, stats)
1851
start_queue_dump(const struct netdev *netdev, struct nl_dump *dump)
1853
struct ofpbuf request;
1854
struct tcmsg *tcmsg;
1856
tcmsg = tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
1860
tcmsg->tcm_parent = 0;
1861
nl_dump_start(dump, rtnl_sock, &request);
1862
ofpbuf_uninit(&request);
1867
netdev_linux_dump_queues(const struct netdev *netdev,
1868
netdev_dump_queues_cb *cb, void *aux)
1870
struct netdev_dev_linux *netdev_dev =
1871
netdev_dev_linux_cast(netdev_get_dev(netdev));
1872
struct tc_queue *queue, *next_queue;
1873
struct shash details;
1877
error = tc_query_qdisc(netdev);
1880
} else if (!netdev_dev->tc->ops->class_get) {
1885
shash_init(&details);
1886
HMAP_FOR_EACH_SAFE (queue, next_queue, hmap_node,
1887
&netdev_dev->tc->queues) {
1888
shash_clear(&details);
1890
error = netdev_dev->tc->ops->class_get(netdev, queue, &details);
1892
(*cb)(queue->queue_id, &details, aux);
1897
shash_destroy(&details);
1903
netdev_linux_dump_queue_stats(const struct netdev *netdev,
1904
netdev_dump_queue_stats_cb *cb, void *aux)
1906
struct netdev_dev_linux *netdev_dev =
1907
netdev_dev_linux_cast(netdev_get_dev(netdev));
1908
struct nl_dump dump;
1913
error = tc_query_qdisc(netdev);
1916
} else if (!netdev_dev->tc->ops->class_dump_stats) {
1921
if (!start_queue_dump(netdev, &dump)) {
1924
while (nl_dump_next(&dump, &msg)) {
1925
error = netdev_dev->tc->ops->class_dump_stats(netdev, &msg, cb, aux);
1931
error = nl_dump_done(&dump);
1932
return error ? error : last_error;
1936
netdev_linux_get_in4(const struct netdev *netdev_,
1937
struct in_addr *address, struct in_addr *netmask)
1939
struct netdev_dev_linux *netdev_dev =
1940
netdev_dev_linux_cast(netdev_get_dev(netdev_));
1942
if (!(netdev_dev->cache_valid & VALID_IN4)) {
1945
error = netdev_linux_get_ipv4(netdev_, &netdev_dev->address,
1946
SIOCGIFADDR, "SIOCGIFADDR");
1951
error = netdev_linux_get_ipv4(netdev_, &netdev_dev->netmask,
1952
SIOCGIFNETMASK, "SIOCGIFNETMASK");
1957
netdev_dev->cache_valid |= VALID_IN4;
1959
*address = netdev_dev->address;
1960
*netmask = netdev_dev->netmask;
1961
return address->s_addr == INADDR_ANY ? EADDRNOTAVAIL : 0;
1965
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
1966
struct in_addr netmask)
1968
struct netdev_dev_linux *netdev_dev =
1969
netdev_dev_linux_cast(netdev_get_dev(netdev_));
1972
error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
1974
netdev_dev->cache_valid |= VALID_IN4;
1975
netdev_dev->address = address;
1976
netdev_dev->netmask = netmask;
1977
if (address.s_addr != INADDR_ANY) {
1978
error = do_set_addr(netdev_, SIOCSIFNETMASK,
1979
"SIOCSIFNETMASK", netmask);
1986
/* Parses 'line' as 32 hexadecimal digits (the 16 address bytes), four
 * further hexadecimal fields that are skipped, and an interface name of at
 * most 16 characters.  This is the layout this file reads from
 * /proc/net/if_inet6.
 *
 * On success stores the address bytes in '*in6' and the name in 'ifname' and
 * returns true.  Returns false if 'line' does not yield all 17 converted
 * fields; '*in6' and 'ifname' may then be partially written. */
static bool
parse_if_inet6_line(const char *line,
                    struct in6_addr *in6, char ifname[16 + 1])
{
    uint8_t *s6 = in6->s6_addr;
#define X8 "%2"SCNx8
    return sscanf(line,
                  " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
                  "%*x %*x %*x %*x %16s\n",
                  &s6[0], &s6[1], &s6[2], &s6[3],
                  &s6[4], &s6[5], &s6[6], &s6[7],
                  &s6[8], &s6[9], &s6[10], &s6[11],
                  &s6[12], &s6[13], &s6[14], &s6[15],
                  ifname) == 17;
}
2001
/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if
2002
* 'in6' is non-null) and returns true. Otherwise, returns false. */
2004
netdev_linux_get_in6(const struct netdev *netdev_, struct in6_addr *in6)
2006
struct netdev_dev_linux *netdev_dev =
2007
netdev_dev_linux_cast(netdev_get_dev(netdev_));
2008
if (!(netdev_dev->cache_valid & VALID_IN6)) {
2012
netdev_dev->in6 = in6addr_any;
2014
file = fopen("/proc/net/if_inet6", "r");
2016
const char *name = netdev_get_name(netdev_);
2017
while (fgets(line, sizeof line, file)) {
2018
struct in6_addr in6_tmp;
2019
char ifname[16 + 1];
2020
if (parse_if_inet6_line(line, &in6_tmp, ifname)
2021
&& !strcmp(name, ifname))
2023
netdev_dev->in6 = in6_tmp;
2029
netdev_dev->cache_valid |= VALID_IN6;
2031
*in6 = netdev_dev->in6;
2036
/* Fills '*sa' with an AF_INET socket address whose IPv4 address is 'addr'
 * and whose port is 0.  '*sa' is zeroed first, so no stale bytes survive. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin;
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr = addr;
    sin.sin_port = 0;

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
2049
do_set_addr(struct netdev *netdev,
2050
int ioctl_nr, const char *ioctl_name, struct in_addr addr)
2053
ovs_strzcpy(ifr.ifr_name, netdev_get_name(netdev), sizeof ifr.ifr_name);
2054
make_in4_sockaddr(&ifr.ifr_addr, addr);
2056
return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
2060
/* Adds 'router' as a default IP gateway. */
2062
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
2064
struct in_addr any = { INADDR_ANY };
2068
memset(&rt, 0, sizeof rt);
2069
make_in4_sockaddr(&rt.rt_dst, any);
2070
make_in4_sockaddr(&rt.rt_gateway, router);
2071
make_in4_sockaddr(&rt.rt_genmask, any);
2072
rt.rt_flags = RTF_UP | RTF_GATEWAY;
2073
error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0;
2075
VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
2081
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
2084
static const char fn[] = "/proc/net/route";
2089
*netdev_name = NULL;
2090
stream = fopen(fn, "r");
2091
if (stream == NULL) {
2092
VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
2097
while (fgets(line, sizeof line, stream)) {
2100
ovs_be32 dest, gateway, mask;
2101
int refcnt, metric, mtu;
2102
unsigned int flags, use, window, irtt;
2105
"%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
2107
iface, &dest, &gateway, &flags, &refcnt,
2108
&use, &metric, &mask, &mtu, &window, &irtt) != 11) {
2110
VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
2114
if (!(flags & RTF_UP)) {
2115
/* Skip routes that aren't up. */
2119
/* The output of 'dest', 'mask', and 'gateway' were given in
2120
* network byte order, so we don't need need any endian
2121
* conversions here. */
2122
if ((dest & mask) == (host->s_addr & mask)) {
2124
/* The host is directly reachable. */
2125
next_hop->s_addr = 0;
2127
/* To reach the host, we must go through a gateway. */
2128
next_hop->s_addr = gateway;
2130
*netdev_name = xstrdup(iface);
2142
netdev_linux_get_status(const struct netdev *netdev, struct shash *sh)
2144
struct ethtool_drvinfo drvinfo;
2147
memset(&drvinfo, 0, sizeof drvinfo);
2148
error = netdev_linux_do_ethtool(netdev_get_name(netdev),
2149
(struct ethtool_cmd *)&drvinfo,
2151
"ETHTOOL_GDRVINFO");
2153
shash_add(sh, "driver_name", xstrdup(drvinfo.driver));
2154
shash_add(sh, "driver_version", xstrdup(drvinfo.version));
2155
shash_add(sh, "firmware_version", xstrdup(drvinfo.fw_version));
2161
/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
2162
* successfully retrieved, it stores the corresponding MAC address in 'mac' and
2163
* returns 0. Otherwise, it returns a positive errno value; in particular,
2164
* ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
2166
netdev_linux_arp_lookup(const struct netdev *netdev,
2167
ovs_be32 ip, uint8_t mac[ETH_ADDR_LEN])
2170
struct sockaddr_in sin;
2173
memset(&r, 0, sizeof r);
2174
memset(&sin, 0, sizeof sin);
2175
sin.sin_family = AF_INET;
2176
sin.sin_addr.s_addr = ip;
2178
memcpy(&r.arp_pa, &sin, sizeof sin);
2179
r.arp_ha.sa_family = ARPHRD_ETHER;
2181
ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
2182
COVERAGE_INC(netdev_arp_lookup);
2183
retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0;
2185
memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
2186
} else if (retval != ENXIO) {
2187
VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
2188
netdev_get_name(netdev), IP_ARGS(&ip), strerror(retval));
2194
nd_to_iff_flags(enum netdev_flags nd)
2197
if (nd & NETDEV_UP) {
2200
if (nd & NETDEV_PROMISC) {
2207
iff_to_nd_flags(int iff)
2209
enum netdev_flags nd = 0;
2213
if (iff & IFF_PROMISC) {
2214
nd |= NETDEV_PROMISC;
2220
netdev_linux_update_flags(struct netdev *netdev, enum netdev_flags off,
2221
enum netdev_flags on, enum netdev_flags *old_flagsp)
2223
int old_flags, new_flags;
2226
error = get_flags(netdev, &old_flags);
2228
*old_flagsp = iff_to_nd_flags(old_flags);
2229
new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
2230
if (new_flags != old_flags) {
2231
error = set_flags(netdev, new_flags);
2238
netdev_linux_change_seq(const struct netdev *netdev)
2240
return netdev_dev_linux_cast(netdev_get_dev(netdev))->change_seq;
2243
#define NETDEV_LINUX_CLASS(NAME, CREATE, GET_STATS, SET_STATS) \
2247
netdev_linux_init, \
2249
netdev_linux_wait, \
2252
netdev_linux_destroy, \
2253
NULL, /* get_config */ \
2254
NULL, /* set_config */ \
2256
netdev_linux_open, \
2257
netdev_linux_close, \
2259
netdev_linux_listen, \
2260
netdev_linux_recv, \
2261
netdev_linux_recv_wait, \
2262
netdev_linux_drain, \
2264
netdev_linux_send, \
2265
netdev_linux_send_wait, \
2267
netdev_linux_set_etheraddr, \
2268
netdev_linux_get_etheraddr, \
2269
netdev_linux_get_mtu, \
2270
netdev_linux_set_mtu, \
2271
netdev_linux_get_ifindex, \
2272
netdev_linux_get_carrier, \
2273
netdev_linux_get_carrier_resets, \
2274
netdev_linux_set_miimon_interval, \
2278
netdev_linux_get_features, \
2279
netdev_linux_set_advertisements, \
2281
netdev_linux_set_policing, \
2282
netdev_linux_get_qos_types, \
2283
netdev_linux_get_qos_capabilities, \
2284
netdev_linux_get_qos, \
2285
netdev_linux_set_qos, \
2286
netdev_linux_get_queue, \
2287
netdev_linux_set_queue, \
2288
netdev_linux_delete_queue, \
2289
netdev_linux_get_queue_stats, \
2290
netdev_linux_dump_queues, \
2291
netdev_linux_dump_queue_stats, \
2293
netdev_linux_get_in4, \
2294
netdev_linux_set_in4, \
2295
netdev_linux_get_in6, \
2296
netdev_linux_add_router, \
2297
netdev_linux_get_next_hop, \
2298
netdev_linux_get_status, \
2299
netdev_linux_arp_lookup, \
2301
netdev_linux_update_flags, \
2303
netdev_linux_change_seq \
2306
const struct netdev_class netdev_linux_class =
2309
netdev_linux_create,
2310
netdev_linux_get_stats,
2311
NULL); /* set_stats */
2313
const struct netdev_class netdev_tap_class =
2316
netdev_linux_create_tap,
2317
netdev_pseudo_get_stats,
2318
NULL); /* set_stats */
2320
const struct netdev_class netdev_internal_class =
2323
netdev_linux_create,
2324
netdev_pseudo_get_stats,
2325
netdev_vport_set_stats);
2327
/* HTB traffic control class. */
2329
#define HTB_N_QUEUES 0xf000
2333
unsigned int max_rate; /* In bytes/s. */
2337
struct tc_queue tc_queue;
2338
unsigned int min_rate; /* In bytes/s. */
2339
unsigned int max_rate; /* In bytes/s. */
2340
unsigned int burst; /* In bytes. */
2341
unsigned int priority; /* Lower values are higher priorities. */
2345
htb_get__(const struct netdev *netdev)
2347
struct netdev_dev_linux *netdev_dev =
2348
netdev_dev_linux_cast(netdev_get_dev(netdev));
2349
return CONTAINER_OF(netdev_dev->tc, struct htb, tc);
2353
htb_install__(struct netdev *netdev, uint64_t max_rate)
2355
struct netdev_dev_linux *netdev_dev =
2356
netdev_dev_linux_cast(netdev_get_dev(netdev));
2359
htb = xmalloc(sizeof *htb);
2360
tc_init(&htb->tc, &tc_ops_htb);
2361
htb->max_rate = max_rate;
2363
netdev_dev->tc = &htb->tc;
2366
/* Create an HTB qdisc.
2368
* Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
2370
htb_setup_qdisc__(struct netdev *netdev)
2373
struct tc_htb_glob opt;
2374
struct ofpbuf request;
2375
struct tcmsg *tcmsg;
2377
tc_del_qdisc(netdev);
2379
tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
2380
NLM_F_EXCL | NLM_F_CREATE, &request);
2384
tcmsg->tcm_handle = tc_make_handle(1, 0);
2385
tcmsg->tcm_parent = TC_H_ROOT;
2387
nl_msg_put_string(&request, TCA_KIND, "htb");
2389
memset(&opt, 0, sizeof opt);
2390
opt.rate2quantum = 10;
2394
opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2395
nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
2396
nl_msg_end_nested(&request, opt_offset);
2398
return tc_transact(&request, NULL);
2401
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
2402
* rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
2404
htb_setup_class__(struct netdev *netdev, unsigned int handle,
2405
unsigned int parent, struct htb_class *class)
2408
struct tc_htb_opt opt;
2409
struct ofpbuf request;
2410
struct tcmsg *tcmsg;
2414
error = netdev_get_mtu(netdev, &mtu);
2416
VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
2417
netdev_get_name(netdev));
2421
memset(&opt, 0, sizeof opt);
2422
tc_fill_rate(&opt.rate, class->min_rate, mtu);
2423
tc_fill_rate(&opt.ceil, class->max_rate, mtu);
2424
opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
2425
opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
2426
opt.prio = class->priority;
2428
tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
2432
tcmsg->tcm_handle = handle;
2433
tcmsg->tcm_parent = parent;
2435
nl_msg_put_string(&request, TCA_KIND, "htb");
2436
opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2437
nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
2438
tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
2439
tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
2440
nl_msg_end_nested(&request, opt_offset);
2442
error = tc_transact(&request, NULL);
2444
VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
2445
"min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
2446
netdev_get_name(netdev),
2447
tc_get_major(handle), tc_get_minor(handle),
2448
tc_get_major(parent), tc_get_minor(parent),
2449
class->min_rate, class->max_rate,
2450
class->burst, class->priority, strerror(error));
2455
/* Parses Netlink attributes in 'options' for HTB parameters and stores a
2456
* description of them into 'details'. The description complies with the
2457
* specification given in the vswitch database documentation for linux-htb
2460
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
2462
static const struct nl_policy tca_htb_policy[] = {
2463
[TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
2464
.min_len = sizeof(struct tc_htb_opt) },
2467
struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
2468
const struct tc_htb_opt *htb;
2470
if (!nl_parse_nested(nl_options, tca_htb_policy,
2471
attrs, ARRAY_SIZE(tca_htb_policy))) {
2472
VLOG_WARN_RL(&rl, "failed to parse HTB class options");
2476
htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
2477
class->min_rate = htb->rate.rate;
2478
class->max_rate = htb->ceil.rate;
2479
class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
2480
class->priority = htb->prio;
2485
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2486
struct htb_class *options,
2487
struct netdev_queue_stats *stats)
2489
struct nlattr *nl_options;
2490
unsigned int handle;
2493
error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2494
if (!error && queue_id) {
2495
unsigned int major = tc_get_major(handle);
2496
unsigned int minor = tc_get_minor(handle);
2497
if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2498
*queue_id = minor - 1;
2503
if (!error && options) {
2504
error = htb_parse_tca_options__(nl_options, options);
2510
htb_parse_qdisc_details__(struct netdev *netdev,
2511
const struct shash *details, struct htb_class *hc)
2513
const char *max_rate_s;
2515
max_rate_s = shash_find_data(details, "max-rate");
2516
hc->max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2517
if (!hc->max_rate) {
2520
netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
2521
hc->max_rate = netdev_features_to_bps(current) / 8;
2523
hc->min_rate = hc->max_rate;
2529
htb_parse_class_details__(struct netdev *netdev,
2530
const struct shash *details, struct htb_class *hc)
2532
const struct htb *htb = htb_get__(netdev);
2533
const char *min_rate_s = shash_find_data(details, "min-rate");
2534
const char *max_rate_s = shash_find_data(details, "max-rate");
2535
const char *burst_s = shash_find_data(details, "burst");
2536
const char *priority_s = shash_find_data(details, "priority");
2539
error = netdev_get_mtu(netdev, &mtu);
2541
VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
2542
netdev_get_name(netdev));
2546
/* HTB requires at least an mtu sized min-rate to send any traffic even
2547
* on uncongested links. */
2548
hc->min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
2549
hc->min_rate = MAX(hc->min_rate, mtu);
2550
hc->min_rate = MIN(hc->min_rate, htb->max_rate);
2553
hc->max_rate = (max_rate_s
2554
? strtoull(max_rate_s, NULL, 10) / 8
2556
hc->max_rate = MAX(hc->max_rate, hc->min_rate);
2557
hc->max_rate = MIN(hc->max_rate, htb->max_rate);
2561
* According to hints in the documentation that I've read, it is important
2562
* that 'burst' be at least as big as the largest frame that might be
2563
* transmitted. Also, making 'burst' a bit bigger than necessary is OK,
2564
* but having it a bit too small is a problem. Since netdev_get_mtu()
2565
* doesn't include the Ethernet header, we need to add at least 14 (18?) to
2566
* the MTU. We actually add 64, instead of 14, as a guard against
2567
* additional headers get tacked on somewhere that we're not aware of. */
2568
hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
2569
hc->burst = MAX(hc->burst, mtu + 64);
2572
hc->priority = priority_s ? strtoul(priority_s, NULL, 10) : 0;
/* Asks the kernel (via tc_query_class()) about the class 'handle' with parent
 * 'parent' on 'netdev' and decodes the reply with htb_parse_tcmsg__() into
 * '*options' and '*stats'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
htb_query_class__(const struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *options,
                  struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (error) {
        return error;
    }

    error = htb_parse_tcmsg__(reply, NULL, options, stats);
    ofpbuf_delete(reply);
    return error;
}
2594
htb_tc_install(struct netdev *netdev, const struct shash *details)
2598
error = htb_setup_qdisc__(netdev);
2600
struct htb_class hc;
2602
htb_parse_qdisc_details__(netdev, details, &hc);
2603
error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2604
tc_make_handle(1, 0), &hc);
2606
htb_install__(netdev, hc.max_rate);
2612
static struct htb_class *
2613
htb_class_cast__(const struct tc_queue *queue)
2615
return CONTAINER_OF(queue, struct htb_class, tc_queue);
2619
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
2620
const struct htb_class *hc)
2622
struct htb *htb = htb_get__(netdev);
2623
size_t hash = hash_int(queue_id, 0);
2624
struct tc_queue *queue;
2625
struct htb_class *hcp;
2627
queue = tc_find_queue__(netdev, queue_id, hash);
2629
hcp = htb_class_cast__(queue);
2631
hcp = xmalloc(sizeof *hcp);
2632
queue = &hcp->tc_queue;
2633
queue->queue_id = queue_id;
2634
hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
2637
hcp->min_rate = hc->min_rate;
2638
hcp->max_rate = hc->max_rate;
2639
hcp->burst = hc->burst;
2640
hcp->priority = hc->priority;
2644
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
2647
struct nl_dump dump;
2648
struct htb_class hc;
2650
/* Get qdisc options. */
2652
htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
2653
htb_install__(netdev, hc.max_rate);
2656
if (!start_queue_dump(netdev, &dump)) {
2659
while (nl_dump_next(&dump, &msg)) {
2660
unsigned int queue_id;
2662
if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
2663
htb_update_queue__(netdev, queue_id, &hc);
2666
nl_dump_done(&dump);
2672
htb_tc_destroy(struct tc *tc)
2674
struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
2675
struct htb_class *hc, *next;
2677
HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &htb->tc.queues) {
2678
hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2686
htb_qdisc_get(const struct netdev *netdev, struct shash *details)
2688
const struct htb *htb = htb_get__(netdev);
2689
shash_add(details, "max-rate", xasprintf("%llu", 8ULL * htb->max_rate));
2694
htb_qdisc_set(struct netdev *netdev, const struct shash *details)
2696
struct htb_class hc;
2699
htb_parse_qdisc_details__(netdev, details, &hc);
2700
error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
2701
tc_make_handle(1, 0), &hc);
2703
htb_get__(netdev)->max_rate = hc.max_rate;
2709
htb_class_get(const struct netdev *netdev OVS_UNUSED,
2710
const struct tc_queue *queue, struct shash *details)
2712
const struct htb_class *hc = htb_class_cast__(queue);
2714
shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
2715
if (hc->min_rate != hc->max_rate) {
2716
shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
2718
shash_add(details, "burst", xasprintf("%llu", 8ULL * hc->burst));
2720
shash_add(details, "priority", xasprintf("%u", hc->priority));
2726
htb_class_set(struct netdev *netdev, unsigned int queue_id,
2727
const struct shash *details)
2729
struct htb_class hc;
2732
error = htb_parse_class_details__(netdev, details, &hc);
2737
error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
2738
tc_make_handle(1, 0xfffe), &hc);
2743
htb_update_queue__(netdev, queue_id, &hc);
2748
htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
2750
struct htb_class *hc = htb_class_cast__(queue);
2751
struct htb *htb = htb_get__(netdev);
2754
error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
2756
hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
2763
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
2764
struct netdev_queue_stats *stats)
2766
return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
2767
tc_make_handle(1, 0xfffe), NULL, stats);
2771
htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
2772
const struct ofpbuf *nlmsg,
2773
netdev_dump_queue_stats_cb *cb, void *aux)
2775
struct netdev_queue_stats stats;
2776
unsigned int handle, major, minor;
2779
error = tc_parse_class(nlmsg, &handle, NULL, &stats);
2784
major = tc_get_major(handle);
2785
minor = tc_get_minor(handle);
2786
if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
2787
(*cb)(minor - 1, &stats, aux);
2792
static const struct tc_ops tc_ops_htb = {
2793
"htb", /* linux_name */
2794
"linux-htb", /* ovs_name */
2795
HTB_N_QUEUES, /* n_queues */
2804
htb_class_get_stats,
2805
htb_class_dump_stats
2808
/* "linux-hfsc" traffic control class. */
2810
#define HFSC_N_QUEUES 0xf000
2818
struct tc_queue tc_queue;
2823
static struct hfsc *
2824
hfsc_get__(const struct netdev *netdev)
2826
struct netdev_dev_linux *netdev_dev;
2827
netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2828
return CONTAINER_OF(netdev_dev->tc, struct hfsc, tc);
2831
static struct hfsc_class *
2832
hfsc_class_cast__(const struct tc_queue *queue)
2834
return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
2838
hfsc_install__(struct netdev *netdev, uint32_t max_rate)
2840
struct netdev_dev_linux * netdev_dev;
2843
netdev_dev = netdev_dev_linux_cast(netdev_get_dev(netdev));
2844
hfsc = xmalloc(sizeof *hfsc);
2845
tc_init(&hfsc->tc, &tc_ops_hfsc);
2846
hfsc->max_rate = max_rate;
2847
netdev_dev->tc = &hfsc->tc;
2851
hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
2852
const struct hfsc_class *hc)
2856
struct hfsc_class *hcp;
2857
struct tc_queue *queue;
2859
hfsc = hfsc_get__(netdev);
2860
hash = hash_int(queue_id, 0);
2862
queue = tc_find_queue__(netdev, queue_id, hash);
2864
hcp = hfsc_class_cast__(queue);
2866
hcp = xmalloc(sizeof *hcp);
2867
queue = &hcp->tc_queue;
2868
queue->queue_id = queue_id;
2869
hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
2872
hcp->min_rate = hc->min_rate;
2873
hcp->max_rate = hc->max_rate;
2877
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
2879
const struct tc_service_curve *rsc, *fsc, *usc;
2880
static const struct nl_policy tca_hfsc_policy[] = {
2882
.type = NL_A_UNSPEC,
2884
.min_len = sizeof(struct tc_service_curve),
2887
.type = NL_A_UNSPEC,
2889
.min_len = sizeof(struct tc_service_curve),
2892
.type = NL_A_UNSPEC,
2894
.min_len = sizeof(struct tc_service_curve),
2897
struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
2899
if (!nl_parse_nested(nl_options, tca_hfsc_policy,
2900
attrs, ARRAY_SIZE(tca_hfsc_policy))) {
2901
VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
2905
rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
2906
fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
2907
usc = nl_attr_get(attrs[TCA_HFSC_USC]);
2909
if (rsc->m1 != 0 || rsc->d != 0 ||
2910
fsc->m1 != 0 || fsc->d != 0 ||
2911
usc->m1 != 0 || usc->d != 0) {
2912
VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2913
"Non-linear service curves are not supported.");
2917
if (rsc->m2 != fsc->m2) {
2918
VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2919
"Real-time service curves are not supported ");
2923
if (rsc->m2 > usc->m2) {
2924
VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
2925
"Min-rate service curve is greater than "
2926
"the max-rate service curve.");
2930
class->min_rate = fsc->m2;
2931
class->max_rate = usc->m2;
2936
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
2937
struct hfsc_class *options,
2938
struct netdev_queue_stats *stats)
2941
unsigned int handle;
2942
struct nlattr *nl_options;
2944
error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
2950
unsigned int major, minor;
2952
major = tc_get_major(handle);
2953
minor = tc_get_minor(handle);
2954
if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
2955
*queue_id = minor - 1;
2962
error = hfsc_parse_tca_options__(nl_options, options);
/* Asks the kernel (via tc_query_class()) about the class 'handle' with parent
 * 'parent' on 'netdev' and decodes the reply with hfsc_parse_tcmsg__() into
 * '*options' and '*stats'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    struct ofpbuf *reply;
    int error;

    error = tc_query_class(netdev, handle, parent, &reply);
    if (!error) {
        error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
        ofpbuf_delete(reply);
    }
    return error;
}
2987
hfsc_parse_qdisc_details__(struct netdev *netdev, const struct shash *details,
2988
struct hfsc_class *class)
2991
const char *max_rate_s;
2993
max_rate_s = shash_find_data(details, "max-rate");
2994
max_rate = max_rate_s ? strtoull(max_rate_s, NULL, 10) / 8 : 0;
2999
netdev_get_features(netdev, ¤t, NULL, NULL, NULL);
3000
max_rate = netdev_features_to_bps(current) / 8;
3003
class->min_rate = max_rate;
3004
class->max_rate = max_rate;
3008
hfsc_parse_class_details__(struct netdev *netdev,
3009
const struct shash *details,
3010
struct hfsc_class * class)
3012
const struct hfsc *hfsc;
3013
uint32_t min_rate, max_rate;
3014
const char *min_rate_s, *max_rate_s;
3016
hfsc = hfsc_get__(netdev);
3017
min_rate_s = shash_find_data(details, "min-rate");
3018
max_rate_s = shash_find_data(details, "max-rate");
3020
min_rate = min_rate_s ? strtoull(min_rate_s, NULL, 10) / 8 : 0;
3021
min_rate = MAX(min_rate, 1);
3022
min_rate = MIN(min_rate, hfsc->max_rate);
3024
max_rate = (max_rate_s
3025
? strtoull(max_rate_s, NULL, 10) / 8
3027
max_rate = MAX(max_rate, min_rate);
3028
max_rate = MIN(max_rate, hfsc->max_rate);
3030
class->min_rate = min_rate;
3031
class->max_rate = max_rate;
3036
/* Create an HFSC qdisc.
3038
* Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
3040
hfsc_setup_qdisc__(struct netdev * netdev)
3042
struct tcmsg *tcmsg;
3043
struct ofpbuf request;
3044
struct tc_hfsc_qopt opt;
3046
tc_del_qdisc(netdev);
3048
tcmsg = tc_make_request(netdev, RTM_NEWQDISC,
3049
NLM_F_EXCL | NLM_F_CREATE, &request);
3055
tcmsg->tcm_handle = tc_make_handle(1, 0);
3056
tcmsg->tcm_parent = TC_H_ROOT;
3058
memset(&opt, 0, sizeof opt);
3061
nl_msg_put_string(&request, TCA_KIND, "hfsc");
3062
nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
3064
return tc_transact(&request, NULL);
3067
/* Create an HFSC class.
3069
* Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
3070
* sc rate <min_rate> ul rate <max_rate>" */
3072
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
3073
unsigned int parent, struct hfsc_class *class)
3077
struct tcmsg *tcmsg;
3078
struct ofpbuf request;
3079
struct tc_service_curve min, max;
3081
tcmsg = tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, &request);
3087
tcmsg->tcm_handle = handle;
3088
tcmsg->tcm_parent = parent;
3092
min.m2 = class->min_rate;
3096
max.m2 = class->max_rate;
3098
nl_msg_put_string(&request, TCA_KIND, "hfsc");
3099
opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3100
nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
3101
nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
3102
nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
3103
nl_msg_end_nested(&request, opt_offset);
3105
error = tc_transact(&request, NULL);
3107
VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
3108
"min-rate %ubps, max-rate %ubps (%s)",
3109
netdev_get_name(netdev),
3110
tc_get_major(handle), tc_get_minor(handle),
3111
tc_get_major(parent), tc_get_minor(parent),
3112
class->min_rate, class->max_rate, strerror(error));
3119
hfsc_tc_install(struct netdev *netdev, const struct shash *details)
3122
struct hfsc_class class;
3124
error = hfsc_setup_qdisc__(netdev);
3130
hfsc_parse_qdisc_details__(netdev, details, &class);
3131
error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3132
tc_make_handle(1, 0), &class);
3138
hfsc_install__(netdev, class.max_rate);
3143
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3146
struct nl_dump dump;
3147
struct hfsc_class hc;
3150
hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
3151
hfsc_install__(netdev, hc.max_rate);
3153
if (!start_queue_dump(netdev, &dump)) {
3157
while (nl_dump_next(&dump, &msg)) {
3158
unsigned int queue_id;
3160
if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
3161
hfsc_update_queue__(netdev, queue_id, &hc);
3165
nl_dump_done(&dump);
3170
hfsc_tc_destroy(struct tc *tc)
3173
struct hfsc_class *hc, *next;
3175
hfsc = CONTAINER_OF(tc, struct hfsc, tc);
3177
HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
3178
hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3187
hfsc_qdisc_get(const struct netdev *netdev, struct shash *details)
3189
const struct hfsc *hfsc;
3190
hfsc = hfsc_get__(netdev);
3191
shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hfsc->max_rate));
3196
hfsc_qdisc_set(struct netdev *netdev, const struct shash *details)
3199
struct hfsc_class class;
3201
hfsc_parse_qdisc_details__(netdev, details, &class);
3202
error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
3203
tc_make_handle(1, 0), &class);
3206
hfsc_get__(netdev)->max_rate = class.max_rate;
3213
hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
3214
const struct tc_queue *queue, struct shash *details)
3216
const struct hfsc_class *hc;
3218
hc = hfsc_class_cast__(queue);
3219
shash_add(details, "min-rate", xasprintf("%llu", 8ULL * hc->min_rate));
3220
if (hc->min_rate != hc->max_rate) {
3221
shash_add(details, "max-rate", xasprintf("%llu", 8ULL * hc->max_rate));
3227
hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
3228
const struct shash *details)
3231
struct hfsc_class class;
3233
error = hfsc_parse_class_details__(netdev, details, &class);
3238
error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
3239
tc_make_handle(1, 0xfffe), &class);
3244
hfsc_update_queue__(netdev, queue_id, &class);
3249
hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
3253
struct hfsc_class *hc;
3255
hc = hfsc_class_cast__(queue);
3256
hfsc = hfsc_get__(netdev);
3258
error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
3260
hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
3267
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
3268
struct netdev_queue_stats *stats)
3270
return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
3271
tc_make_handle(1, 0xfffe), NULL, stats);
3275
hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
3276
const struct ofpbuf *nlmsg,
3277
netdev_dump_queue_stats_cb *cb, void *aux)
3279
struct netdev_queue_stats stats;
3280
unsigned int handle, major, minor;
3283
error = tc_parse_class(nlmsg, &handle, NULL, &stats);
3288
major = tc_get_major(handle);
3289
minor = tc_get_minor(handle);
3290
if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
3291
(*cb)(minor - 1, &stats, aux);
3296
static const struct tc_ops tc_ops_hfsc = {
3297
"hfsc", /* linux_name */
3298
"linux-hfsc", /* ovs_name */
3299
HFSC_N_QUEUES, /* n_queues */
3300
hfsc_tc_install, /* tc_install */
3301
hfsc_tc_load, /* tc_load */
3302
hfsc_tc_destroy, /* tc_destroy */
3303
hfsc_qdisc_get, /* qdisc_get */
3304
hfsc_qdisc_set, /* qdisc_set */
3305
hfsc_class_get, /* class_get */
3306
hfsc_class_set, /* class_set */
3307
hfsc_class_delete, /* class_delete */
3308
hfsc_class_get_stats, /* class_get_stats */
3309
hfsc_class_dump_stats /* class_dump_stats */
3312
/* "linux-default" traffic control class.
3314
* This class represents the default, unnamed Linux qdisc. It corresponds to
3315
* the "" (empty string) QoS type in the OVS database. */
3318
default_install__(struct netdev *netdev)
3320
struct netdev_dev_linux *netdev_dev =
3321
netdev_dev_linux_cast(netdev_get_dev(netdev));
3322
static struct tc *tc;
3325
tc = xmalloc(sizeof *tc);
3326
tc_init(tc, &tc_ops_default);
3328
netdev_dev->tc = tc;
3332
default_tc_install(struct netdev *netdev,
3333
const struct shash *details OVS_UNUSED)
3335
default_install__(netdev);
3340
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3342
default_install__(netdev);
3346
static const struct tc_ops tc_ops_default = {
3347
NULL, /* linux_name */
3352
NULL, /* tc_destroy */
3353
NULL, /* qdisc_get */
3354
NULL, /* qdisc_set */
3355
NULL, /* class_get */
3356
NULL, /* class_set */
3357
NULL, /* class_delete */
3358
NULL, /* class_get_stats */
3359
NULL /* class_dump_stats */
3362
/* "linux-other" traffic control class.
3367
other_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
3369
struct netdev_dev_linux *netdev_dev =
3370
netdev_dev_linux_cast(netdev_get_dev(netdev));
3371
static struct tc *tc;
3374
tc = xmalloc(sizeof *tc);
3375
tc_init(tc, &tc_ops_other);
3377
netdev_dev->tc = tc;
3381
static const struct tc_ops tc_ops_other = {
3382
NULL, /* linux_name */
3383
"linux-other", /* ovs_name */
3385
NULL, /* tc_install */
3387
NULL, /* tc_destroy */
3388
NULL, /* qdisc_get */
3389
NULL, /* qdisc_set */
3390
NULL, /* class_get */
3391
NULL, /* class_set */
3392
NULL, /* class_delete */
3393
NULL, /* class_get_stats */
3394
NULL /* class_dump_stats */
/* Traffic control. */

/* Number of kernel "tc" ticks per second. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
/* Returns tc handle 'major':'minor'. */
static unsigned int
tc_make_handle(unsigned int major, unsigned int minor)
{
    const unsigned int major_bits = major << 16;

    return TC_H_MAKE(major_bits, minor);
}
/* Returns the major number from 'handle'. */
static unsigned int
tc_get_major(unsigned int handle)
{
    const unsigned int major_bits = TC_H_MAJ(handle);

    return major_bits >> 16;
}
/* Returns the minor number from 'handle'. */
static unsigned int
tc_get_minor(unsigned int handle)
{
    const unsigned int minor = TC_H_MIN(handle);

    return minor;
}
3442
static struct tcmsg *
3443
tc_make_request(const struct netdev *netdev, int type, unsigned int flags,
3444
struct ofpbuf *request)
3446
struct tcmsg *tcmsg;
3450
error = get_ifindex(netdev, &ifindex);
3455
ofpbuf_init(request, 512);
3456
nl_msg_put_nlmsghdr(request, sizeof *tcmsg, type, NLM_F_REQUEST | flags);
3457
tcmsg = ofpbuf_put_zeros(request, sizeof *tcmsg);
3458
tcmsg->tcm_family = AF_UNSPEC;
3459
tcmsg->tcm_ifindex = ifindex;
3460
/* Caller should fill in tcmsg->tcm_handle. */
3461
/* Caller should fill in tcmsg->tcm_parent. */
3467
tc_transact(struct ofpbuf *request, struct ofpbuf **replyp)
3469
int error = nl_sock_transact(rtnl_sock, request, replyp);
3470
ofpbuf_uninit(request);
3477
/* The values in psched are not individually very meaningful, but they are
3478
* important. The tables below show some values seen in the wild.
3482
* - "c" has always been a constant 1000000 since at least Linux 2.4.14.
3483
* (Before that, there are hints that it was 1000000000.)
3485
* - "d" can be unrealistically large, see the comment on 'buffer_hz'
3489
* -----------------------------------
3490
* [1] 000c8000 000f4240 000f4240 00000064
3491
* [2] 000003e8 00000400 000f4240 3b9aca00
3492
* [3] 000003e8 00000400 000f4240 3b9aca00
3493
* [4] 000003e8 00000400 000f4240 00000064
3494
* [5] 000003e8 00000040 000f4240 3b9aca00
3495
* [6] 000003e8 00000040 000f4240 000000f9
3497
* a b c d ticks_per_s buffer_hz
3498
* ------- --------- ---------- ------------- ----------- -------------
3499
* [1] 819,200 1,000,000 1,000,000 100 819,200 100
3500
* [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3501
* [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
3502
* [4] 1,000 1,024 1,000,000 100 976,562 100
3503
* [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
3504
* [6] 1,000 64 1,000,000 249 15,625,000 249
3506
* [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
3507
* [2] 2.6.26-1-686-bigmem from Debian lenny
3508
* [3] 2.6.26-2-sparc64 from Debian lenny
3509
* [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
3510
* [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
3511
* [6] 2.6.34 from kernel.org on KVM
3513
static const char fn[] = "/proc/net/psched";
3514
unsigned int a, b, c, d;
3520
stream = fopen(fn, "r");
3522
VLOG_WARN("%s: open failed: %s", fn, strerror(errno));
3526
if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
3527
VLOG_WARN("%s: read failed", fn);
3531
VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
3535
VLOG_WARN("%s: invalid scheduler parameters", fn);
3539
ticks_per_s = (double) a * c / b;
3543
VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
3546
VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
3549
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
3550
* rate of 'rate' bytes per second. */
3552
tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
3557
return (rate * ticks) / ticks_per_s;
3560
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
3561
* rate of 'rate' bytes per second. */
3563
tc_bytes_to_ticks(unsigned int rate, unsigned int size)
3568
return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
3571
/* Returns the number of bytes that need to be reserved for qdisc buffering at
3572
* a transmission rate of 'rate' bytes per second. */
3574
tc_buffer_per_jiffy(unsigned int rate)
3579
return rate / buffer_hz;
3582
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
3583
* e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
3584
* extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
3585
* stores NULL into it if it is absent.
3587
* '*kind' and '*options' point into 'msg', so they are owned by whoever owns
3590
* Returns 0 if successful, otherwise a positive errno value. */
3592
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
3593
struct nlattr **options)
3595
static const struct nl_policy tca_policy[] = {
3596
[TCA_KIND] = { .type = NL_A_STRING, .optional = false },
3597
[TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
3599
struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3601
if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3602
tca_policy, ta, ARRAY_SIZE(ta))) {
3603
VLOG_WARN_RL(&rl, "failed to parse qdisc message");
3608
*kind = nl_attr_get_string(ta[TCA_KIND]);
3612
*options = ta[TCA_OPTIONS];
3627
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
3628
* minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
3629
* into '*options', and its queue statistics into '*stats'. Any of the output
3630
* arguments may be null.
3632
* Returns 0 if successful, otherwise a positive errno value. */
3634
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
3635
struct nlattr **options, struct netdev_queue_stats *stats)
3637
static const struct nl_policy tca_policy[] = {
3638
[TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
3639
[TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
3641
struct nlattr *ta[ARRAY_SIZE(tca_policy)];
3643
if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
3644
tca_policy, ta, ARRAY_SIZE(ta))) {
3645
VLOG_WARN_RL(&rl, "failed to parse class message");
3650
struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
3651
*handlep = tc->tcm_handle;
3655
*options = ta[TCA_OPTIONS];
3659
const struct gnet_stats_queue *gsq;
3660
struct gnet_stats_basic gsb;
3662
static const struct nl_policy stats_policy[] = {
3663
[TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
3664
.min_len = sizeof gsb },
3665
[TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
3666
.min_len = sizeof *gsq },
3668
struct nlattr *sa[ARRAY_SIZE(stats_policy)];
3670
if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
3671
sa, ARRAY_SIZE(sa))) {
3672
VLOG_WARN_RL(&rl, "failed to parse class stats");
3676
/* Alignment issues screw up the length of struct gnet_stats_basic on
3677
* some arch/bitsize combinations. Newer versions of Linux have a
3678
* struct gnet_stats_basic_packed, but we can't depend on that. The
3679
* easiest thing to do is just to make a copy. */
3680
memset(&gsb, 0, sizeof gsb);
3681
memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
3682
MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
3683
stats->tx_bytes = gsb.bytes;
3684
stats->tx_packets = gsb.packets;
3686
gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
3687
stats->tx_errors = gsq->drops;
3697
memset(stats, 0, sizeof *stats);
3702
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
3705
tc_query_class(const struct netdev *netdev,
3706
unsigned int handle, unsigned int parent,
3707
struct ofpbuf **replyp)
3709
struct ofpbuf request;
3710
struct tcmsg *tcmsg;
3713
tcmsg = tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO, &request);
3717
tcmsg->tcm_handle = handle;
3718
tcmsg->tcm_parent = parent;
3720
error = tc_transact(&request, replyp);
3722
VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
3723
netdev_get_name(netdev),
3724
tc_get_major(handle), tc_get_minor(handle),
3725
tc_get_major(parent), tc_get_minor(parent),
3731
/* Equivalent to "tc class del dev <name> handle <handle>". */
3733
tc_delete_class(const struct netdev *netdev, unsigned int handle)
3735
struct ofpbuf request;
3736
struct tcmsg *tcmsg;
3739
tcmsg = tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
3743
tcmsg->tcm_handle = handle;
3744
tcmsg->tcm_parent = 0;
3746
error = tc_transact(&request, NULL);
3748
VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
3749
netdev_get_name(netdev),
3750
tc_get_major(handle), tc_get_minor(handle),
3756
/* Equivalent to "tc qdisc del dev <name> root". */
3758
tc_del_qdisc(struct netdev *netdev)
3760
struct netdev_dev_linux *netdev_dev =
3761
netdev_dev_linux_cast(netdev_get_dev(netdev));
3762
struct ofpbuf request;
3763
struct tcmsg *tcmsg;
3766
tcmsg = tc_make_request(netdev, RTM_DELQDISC, 0, &request);
3770
tcmsg->tcm_handle = tc_make_handle(1, 0);
3771
tcmsg->tcm_parent = TC_H_ROOT;
3773
error = tc_transact(&request, NULL);
3774
if (error == EINVAL) {
3775
/* EINVAL probably means that the default qdisc was in use, in which
3776
* case we've accomplished our purpose. */
3779
if (!error && netdev_dev->tc) {
3780
if (netdev_dev->tc->ops->tc_destroy) {
3781
netdev_dev->tc->ops->tc_destroy(netdev_dev->tc);
3783
netdev_dev->tc = NULL;
3788
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
3789
* kernel to determine what they are. Returns 0 if successful, otherwise a
3790
* positive errno value. */
3792
tc_query_qdisc(const struct netdev *netdev)
3794
struct netdev_dev_linux *netdev_dev =
3795
netdev_dev_linux_cast(netdev_get_dev(netdev));
3796
struct ofpbuf request, *qdisc;
3797
const struct tc_ops *ops;
3798
struct tcmsg *tcmsg;
3802
if (netdev_dev->tc) {
3806
/* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
3807
* commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
3808
* 2.6.35 without that fix backported to it.
3810
* To avoid the OOPS, we must not make a request that would attempt to dump
3811
* a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
3812
* few others. There are a few ways that I can see to do this, but most of
3813
* them seem to be racy (and if you lose the race the kernel OOPSes). The
3814
* technique chosen here is to assume that any non-default qdisc that we
3815
* create will have a class with handle 1:0. The built-in qdiscs only have
3816
* a class with handle 0:0.
3818
* We could check for Linux 2.6.35+ and use a more straightforward method
3820
tcmsg = tc_make_request(netdev, RTM_GETQDISC, NLM_F_ECHO, &request);
3824
tcmsg->tcm_handle = tc_make_handle(1, 0);
3825
tcmsg->tcm_parent = 0;
3827
/* Figure out what tc class to instantiate. */
3828
error = tc_transact(&request, &qdisc);
3832
error = tc_parse_qdisc(qdisc, &kind, NULL);
3834
ops = &tc_ops_other;
3836
ops = tc_lookup_linux_name(kind);
3838
static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
3839
VLOG_INFO_RL(&rl2, "unknown qdisc \"%s\"", kind);
3841
ops = &tc_ops_other;
3844
} else if (error == ENOENT) {
3845
/* Either it's a built-in qdisc, or it's a qdisc set up by some
3846
* other entity that doesn't have a handle 1:0. We will assume
3847
* that it's the system default qdisc. */
3848
ops = &tc_ops_default;
3851
/* Who knows? Maybe the device got deleted. */
3852
VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
3853
netdev_get_name(netdev), strerror(error));
3854
ops = &tc_ops_other;
3857
/* Instantiate it. */
3858
load_error = ops->tc_load((struct netdev *) netdev, qdisc);
3859
assert((load_error == 0) == (netdev_dev->tc != NULL));
3860
ofpbuf_delete(qdisc);
3862
return error ? error : load_error;
3865
/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
3866
approximate the time to transmit packets of various lengths. For an MTU of
3867
256 or less, each entry is exact; for an MTU of 257 through 512, each entry
3868
represents two possible packet lengths; for a MTU of 513 through 1024, four
3869
possible lengths; and so on.
3871
Returns, for the specified 'mtu', the number of bits that packet lengths
3872
need to be shifted right to fit within such a 256-entry table. */
3874
tc_calc_cell_log(unsigned int mtu)
3879
mtu = ETH_PAYLOAD_MAX;
3881
mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
3883
for (cell_log = 0; mtu >= 256; cell_log++) {
3890
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
3893
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
3895
memset(rate, 0, sizeof *rate);
3896
rate->cell_log = tc_calc_cell_log(mtu);
3897
/* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
3898
/* rate->cell_align = 0; */ /* distro headers. */
3899
rate->mpu = ETH_TOTAL_MIN;
3903
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
3904
* attribute of the specified "type".
3906
* See tc_calc_cell_log() above for a description of "rtab"s. */
3908
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
3913
/* Reserve TC_RTAB_SIZE bytes of attribute payload in 'msg' (presumably
 * uninitialized, going by the helper's name); the loop below writes every
 * entry. */
rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
3914
for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
3915
/* Packet length represented by entry 'i': (i + 1) scaled by the cell
 * size. */
unsigned packet_size = (i + 1) << rate->cell_log;
3916
/* Never use a packet size below the ratespec's minimum packet unit. */
if (packet_size < rate->mpu) {
3917
packet_size = rate->mpu;
3919
/* Convert that length at 'rate->rate' into ticks for this entry. */
rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
3923
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
3924
* rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
3925
* burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
 * 0 is fine: the computed minimum burst is then used instead.) */
3928
tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
3930
/* Lower bound on the burst: tc_buffer_per_jiffy(Bps) plus one 'mtu'. */
unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
3931
/* Use the larger of the requested burst and that lower bound, converted to
 * ticks at 'Bps'. */
return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
3934
/* Linux-only functions declared in netdev-linux.h */
3936
/* Returns a fd for an AF_INET socket or a negative errno value. */
3938
netdev_linux_get_af_inet_sock(void)
3940
/* Run netdev_linux_init() before handing out 'af_inet_sock'. */
int error = netdev_linux_init();
3941
/* A nonzero 'error' is returned negated; otherwise the socket is returned. */
return error ? -error : af_inet_sock;
3944
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
3945
* 'enable' is true, the bit is set. Otherwise, it is cleared. */
3947
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
3948
const char *flag_name, bool enable)
3950
const char *netdev_name = netdev_get_name(netdev);
3951
struct ethtool_value evalue;
3955
/* Step 1: read the device's current flags with ETHTOOL_GFLAGS.
 * NOTE(review): the handling of 'error' after each call is not visible in
 * this view. */
memset(&evalue, 0, sizeof evalue);
3956
error = netdev_linux_do_ethtool(netdev_name,
3957
(struct ethtool_cmd *)&evalue,
3958
ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3963
/* Step 2: clear 'flag', put it back if 'enable' is true, remember the
 * result in 'new_flags', and write it with ETHTOOL_SFLAGS. */
evalue.data = new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
3964
error = netdev_linux_do_ethtool(netdev_name,
3965
(struct ethtool_cmd *)&evalue,
3966
ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
3971
/* Step 3: read the flags back so the result can be verified. */
memset(&evalue, 0, sizeof evalue);
3972
error = netdev_linux_do_ethtool(netdev_name,
3973
(struct ethtool_cmd *)&evalue,
3974
ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
3979
/* Step 4: warn if the flags read back differ from what was written. */
if (new_flags != evalue.data) {
3980
VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
3981
"device %s failed", enable ? "enable" : "disable",
3982
flag_name, netdev_name);
3989
/* Utility functions. */
3991
/* Copies 'src' into 'dst', performing format conversion in the process. */
3993
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
3994
const struct rtnl_link_stats *src)
3996
/* Plain member-by-member copy: every 'dst' field assigned below takes the
 * 'src' field of the same name. */
dst->rx_packets = src->rx_packets;
3997
dst->tx_packets = src->tx_packets;
3998
dst->rx_bytes = src->rx_bytes;
3999
dst->tx_bytes = src->tx_bytes;
4000
dst->rx_errors = src->rx_errors;
4001
dst->tx_errors = src->tx_errors;
4002
dst->rx_dropped = src->rx_dropped;
4003
dst->tx_dropped = src->tx_dropped;
4004
dst->multicast = src->multicast;
4005
dst->collisions = src->collisions;
4006
dst->rx_length_errors = src->rx_length_errors;
4007
dst->rx_over_errors = src->rx_over_errors;
4008
dst->rx_crc_errors = src->rx_crc_errors;
4009
dst->rx_frame_errors = src->rx_frame_errors;
4010
dst->rx_fifo_errors = src->rx_fifo_errors;
4011
dst->rx_missed_errors = src->rx_missed_errors;
4012
dst->tx_aborted_errors = src->tx_aborted_errors;
4013
dst->tx_carrier_errors = src->tx_carrier_errors;
4014
dst->tx_fifo_errors = src->tx_fifo_errors;
4015
dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
4016
dst->tx_window_errors = src->tx_window_errors;
4020
/* Retrieves statistics for the interface whose index is 'ifindex' by sending
 * an RTM_GETLINK request on 'rtnl_sock', and converts the IFLA_STATS
 * attribute of the reply into 'stats'.
 *
 * A reply without IFLA_STATS is logged with a rate-limited warning.
 * NOTE(review): the values returned on each path are not visible in this
 * view. */
get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
4022
/* Policy for RTNLGRP_LINK messages.
 *
 * There are *many* more fields in these messages, but currently we only
 * care about these fields. */
4026
static const struct nl_policy rtnlgrp_link_policy[] = {
4027
[IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
4028
[IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
4029
.min_len = sizeof(struct rtnl_link_stats) },
4032
struct ofpbuf request;
4033
struct ofpbuf *reply;
4034
struct ifinfomsg *ifi;
4035
struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
4038
/* Build the request: a netlink header followed by a zeroed ifinfomsg that
 * selects the interface by index. */
ofpbuf_init(&request, 0);
4039
nl_msg_put_nlmsghdr(&request, sizeof *ifi, RTM_GETLINK, NLM_F_REQUEST);
4040
ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
4041
ifi->ifi_family = PF_UNSPEC;
4042
ifi->ifi_index = ifindex;
4043
/* Send it and wait for the reply; the request buffer is released right
 * after the transaction, whatever its outcome. */
error = nl_sock_transact(rtnl_sock, &request, &reply);
4044
ofpbuf_uninit(&request);
4049
/* Parse the attributes that follow the netlink and ifinfomsg headers; the
 * reply is freed if it does not satisfy the policy. */
if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
4050
rtnlgrp_link_policy,
4051
attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
4052
ofpbuf_delete(reply);
4056
/* IFLA_STATS is optional in the policy, so check for it explicitly. */
if (!attrs[IFLA_STATS]) {
4057
VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
4058
ofpbuf_delete(reply);
4062
netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(attrs[IFLA_STATS]));
4064
ofpbuf_delete(reply);
4070
/* Retrieves statistics for the device named 'netdev_name' by scanning
 * /proc/net/dev line by line and parsing counters into 'stats'.
 *
 * Failure to open the file, an unparsable line, and the absence of any entry
 * for 'netdev_name' are each logged with a rate-limited warning.
 * NOTE(review): the values returned on each path are not visible in this
 * view. */
get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
4072
static const char fn[] = "/proc/net/dev";
4077
stream = fopen(fn, "r");
4079
VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno));
4084
while (fgets(line, sizeof line, stream)) {
4087
/* Shorthand for one uint64_t conversion in the scan format below. */
#define X64 "%"SCNu64
4090
X64 X64 X64 X64 X64 X64 X64 "%*u"
4091
X64 X64 X64 X64 X64 X64 X64 "%*u",
4097
&stats->rx_fifo_errors,
4098
&stats->rx_frame_errors,
4104
&stats->tx_fifo_errors,
4106
&stats->tx_carrier_errors) != 15) {
4107
VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
4108
} else if (!strcmp(devname, netdev_name)) {
/* This line is the requested device.  The counters below are set to
 * UINT64_MAX -- presumably a marker for values this file does not
 * provide; confirm against the netdev_stats contract. */
4109
stats->rx_length_errors = UINT64_MAX;
4110
stats->rx_over_errors = UINT64_MAX;
4111
stats->rx_crc_errors = UINT64_MAX;
4112
stats->rx_missed_errors = UINT64_MAX;
4113
stats->tx_aborted_errors = UINT64_MAX;
4114
stats->tx_heartbeat_errors = UINT64_MAX;
4115
stats->tx_window_errors = UINT64_MAX;
4121
VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
4127
/* Reads the carrier state of the network device 'name' from
 * /sys/class/net/<name>/carrier and stores it in '*carrier': false if the
 * first character read is '0', true otherwise.
 *
 * The function contains rate-limited warnings for a failed open, a failed
 * read, an empty file, and a first character other than '0' or '1'.
 * NOTE(review): the values returned on each path are not visible in this
 * view. */
get_carrier_via_sysfs(const char *name, bool *carrier)
4138
fn = xasprintf("/sys/class/net/%s/carrier", name);
4139
fd = open(fn, O_RDONLY);
4142
VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
4146
retval = read(fd, line, sizeof line);
4149
if (error == EINVAL) {
/* This is the normal return value when we try to check carrier if
 * the network device is not up. */
4153
VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error));
4156
} else if (retval == 0) {
4158
VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn);
4162
/* Only the first character of what was read is examined. */
if (line[0] != '0' && line[0] != '1') {
4164
VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]);
4167
*carrier = line[0] != '0';
4179
/* Queries the interface flags of 'netdev' with a SIOCGIFFLAGS request issued
 * through netdev_linux_do_ioctl() and stores the resulting 'ifr_flags' value
 * in '*flags'.
 * NOTE(review): the return statement is not visible in this view. */
get_flags(const struct netdev *netdev, int *flags)
4184
error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCGIFFLAGS,
4186
*flags = ifr.ifr_flags;
4191
/* Sets the interface flags of 'netdev' to 'flags' with a SIOCSIFFLAGS request
 * and returns whatever netdev_linux_do_ioctl() returns. */
set_flags(struct netdev *netdev, int flags)
4195
ifr.ifr_flags = flags;
4196
return netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, SIOCSIFFLAGS,
4201
/* Looks up the interface index of the device named 'netdev_name' with a
 * SIOCGIFINDEX ioctl on 'af_inet_sock' and returns the 'ifr_ifindex' value
 * filled in by the ioctl.
 *
 * A failed ioctl is logged with a rate-limited warning.
 * NOTE(review): the value returned on that failure path is not visible in
 * this view. */
do_get_ifindex(const char *netdev_name)
4205
ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4206
COVERAGE_INC(netdev_get_ifindex);
4207
if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
4208
VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
4209
netdev_name, strerror(errno));
4212
return ifr.ifr_ifindex;
4216
/* Stores the interface index of 'netdev_' in '*ifindexp'.
 *
 * The index is cached in the underlying netdev_dev_linux: do_get_ifindex() is
 * called only while the VALID_IFINDEX bit of 'cache_valid' is clear, and the
 * bit is set once the looked-up index has been stored.
 * NOTE(review): handling of a failed lookup and the return statement are not
 * visible in this view. */
get_ifindex(const struct netdev *netdev_, int *ifindexp)
4218
struct netdev_dev_linux *netdev_dev =
4219
netdev_dev_linux_cast(netdev_get_dev(netdev_));
4221
if (!(netdev_dev->cache_valid & VALID_IFINDEX)) {
4222
int ifindex = do_get_ifindex(netdev_get_name(netdev_));
4226
netdev_dev->cache_valid |= VALID_IFINDEX;
4227
netdev_dev->ifindex = ifindex;
4229
*ifindexp = netdev_dev->ifindex;
4234
/* Retrieves the hardware address of the device named 'netdev_name' with a
 * SIOCGIFHWADDR ioctl on 'af_inet_sock' and copies its first ETH_ADDR_LEN
 * bytes into 'ea'.
 *
 * A failed ioctl is logged (at INFO level for ENODEV, ERR otherwise), and an
 * address family other than AF_UNSPEC or ARPHRD_ETHER is logged as a warning.
 * NOTE(review): the values returned on each path are not visible in this
 * view. */
get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN])
4239
memset(&ifr, 0, sizeof ifr);
4240
ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4241
COVERAGE_INC(netdev_get_hwaddr);
4242
if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
/* ENODEV probably means that a vif disappeared asynchronously and
 * hasn't been removed from the database yet, so reduce the log level
 * to INFO for that case. */
4246
VLOG(errno == ENODEV ? VLL_INFO : VLL_ERR,
4247
"ioctl(SIOCGIFHWADDR) on %s device failed: %s",
4248
netdev_name, strerror(errno));
4251
hwaddr_family = ifr.ifr_hwaddr.sa_family;
4252
/* An unexpected address family is only logged here; the copy into 'ea'
 * below is not skipped by any code visible in this view. */
if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
4253
VLOG_WARN("%s device has unknown hardware address family %d",
4254
netdev_name, hwaddr_family);
4256
memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
4261
/* Sets the hardware address of the device named 'netdev_name' to the
 * ETH_ADDR_LEN bytes at 'mac', tagged with address family 'hwaddr_family',
 * using a SIOCSIFHWADDR ioctl on 'af_inet_sock'.
 *
 * A failed ioctl is logged as an error.
 * NOTE(review): the values returned on each path are not visible in this
 * view. */
set_etheraddr(const char *netdev_name, int hwaddr_family,
4262
const uint8_t mac[ETH_ADDR_LEN])
4266
memset(&ifr, 0, sizeof ifr);
4267
ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
4268
ifr.ifr_hwaddr.sa_family = hwaddr_family;
4269
memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
4270
COVERAGE_INC(netdev_set_hwaddr);
4271
if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
4272
VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
4273
netdev_name, strerror(errno));
4280
/* Issues a SIOCETHTOOL ioctl on 'af_inet_sock' for the device named 'name',
 * passing 'ecmd' as the request data.  'cmd_name' is used only in the log
 * message.
 *
 * A failure whose errno is not EOPNOTSUPP is logged with a rate-limited
 * warning.
 * NOTE(review): where 'cmd' is stored into the request and the values
 * returned on each path are not visible in this view. */
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
4281
int cmd, const char *cmd_name)
4285
memset(&ifr, 0, sizeof ifr);
4286
ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
4287
/* The ethtool request travels through the ifreq's data pointer. */
ifr.ifr_data = (caddr_t) ecmd;
4290
COVERAGE_INC(netdev_ethtool);
4291
if (ioctl(af_inet_sock, SIOCETHTOOL, &ifr) == 0) {
4294
if (errno != EOPNOTSUPP) {
4295
VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
4296
"failed: %s", cmd_name, name, strerror(errno));
/* The device doesn't support this operation.  That's pretty
 * common, so there's no point in logging anything. */
4306
/* Copies 'name' into 'ifr->ifr_name' and issues ioctl request 'cmd' on
 * 'af_inet_sock' with 'ifr' as its argument.  'cmd_name' is used only in the
 * log message.
 *
 * An ioctl result of -1 is logged at debug level, rate-limited.
 * NOTE(review): the values returned on each path are not visible in this
 * view. */
netdev_linux_do_ioctl(const char *name, struct ifreq *ifr, int cmd,
4307
const char *cmd_name)
4309
ovs_strzcpy(ifr->ifr_name, name, sizeof ifr->ifr_name);
4310
if (ioctl(af_inet_sock, cmd, ifr) == -1) {
4311
VLOG_DBG_RL(&rl, "%s: ioctl(%s) failed: %s", name, cmd_name,
4319
/* Issues ioctl request 'cmd' for 'netdev' through netdev_linux_do_ioctl(),
 * with the request's address family set to AF_INET, and stores the IPv4
 * address found in the resulting 'ifr_addr' in '*ip'.  'cmd_name' is passed
 * along for logging.
 * NOTE(review): the condition guarding the copy into '*ip' and the return
 * statement are not visible in this view. */
netdev_linux_get_ipv4(const struct netdev *netdev, struct in_addr *ip,
4320
int cmd, const char *cmd_name)
4325
ifr.ifr_addr.sa_family = AF_INET;
4326
error = netdev_linux_do_ioctl(netdev_get_name(netdev), &ifr, cmd, cmd_name);
4328
/* View the generic sockaddr in the reply as an IPv4 socket address. */
const struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr;
4329
*ip = sin->sin_addr;
4334
/* Returns an AF_PACKET raw socket or a negative errno value. */
4336
af_packet_sock(void)
4338
static int sock = INT_MIN;
4340
if (sock == INT_MIN) {
4341
sock = socket(AF_PACKET, SOCK_RAW, 0);
4343
set_nonblocking(sock);
4346
VLOG_ERR("failed to create packet socket: %s", strerror(errno));