diff options
author | Simon Horman <horms@verge.net.au> | 2014-01-15 17:17:02 +0900 |
---|---|---|
committer | Ben Pfaff <blp@nicira.com> | 2014-01-16 15:09:14 -0800 |
commit | b73c85181df9cc38231a42d6f8095dcb604d230a (patch) | |
tree | 7c3d4f9363ad681e7f001e008cf71a2e661d5725 | |
parent | 1bf02876a46e3e86a59f959fdac57db7f6b0a4ad (diff) | |
download | openvswitch-b73c85181df9cc38231a42d6f8095dcb604d230a.tar.gz |
netdev-linux: Read packet auxdata to obtain vlan_tci
If VLAN acceleration is used when the kernel receives a packet
then the outer-most VLAN tag will not be present in the packet
when it is received by netdev-linux. Rather, it will be present
in auxdata.
This patch uses recvmsg() instead of recv() to read auxdata for
each packet and, if the vlan_tci is set, re-inserts the VLAN tag into the packet.
Re-inserting the VLAN tag makes use of headroom available
in the buffer parameter of rx_recv.
Signed-off-by: Simon Horman <horms@verge.net.au>
Co-authored-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Ben Pfaff <blp@nicira.com>
-rw-r--r-- | include/sparse/sys/socket.h | 1 | ||||
-rw-r--r-- | lib/netdev-linux.c | 156 | ||||
-rw-r--r-- | lib/netdev-provider.h | 10 | ||||
-rw-r--r-- | lib/netdev.c | 34 |
4 files changed, 184 insertions, 17 deletions
diff --git a/include/sparse/sys/socket.h b/include/sparse/sys/socket.h index 75ee43c6c..3212bf4b7 100644 --- a/include/sparse/sys/socket.h +++ b/include/sparse/sys/socket.h @@ -87,6 +87,7 @@ enum { }; enum { + SOL_PACKET, SOL_SOCKET }; diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 106c18a3d..9c1a36db1 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -20,11 +20,11 @@ #include <errno.h> #include <fcntl.h> -#include <arpa/inet.h> #include <inttypes.h> #include <linux/filter.h> #include <linux/gen_stats.h> #include <linux/if_ether.h> +#include <linux/if_packet.h> #include <linux/if_tun.h> #include <linux/types.h> #include <linux/ethtool.h> @@ -37,10 +37,8 @@ #include <sys/types.h> #include <sys/ioctl.h> #include <sys/socket.h> -#include <netpacket/packet.h> #include <net/if.h> #include <net/if_arp.h> -#include <net/if_packet.h> #include <net/route.h> #include <netinet/in.h> #include <poll.h> @@ -109,6 +107,33 @@ COVERAGE_DEFINE(netdev_set_ethtool); #define TC_RTAB_SIZE 1024 #endif +/* Linux 2.6.21 introduced struct tpacket_auxdata. + * Linux 2.6.27 added the tp_vlan_tci member. + * Linux 3.0 defined TP_STATUS_VLAN_VALID. + * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined + * TP_STATUS_VLAN_TPID_VALID. + * + * With all this churn it's easiest to unconditionally define a replacement + * structure that has everything we want. 
+ */ +#ifndef TP_STATUS_VLAN_VALID +#define TP_STATUS_VLAN_VALID (1 << 4) +#endif +#ifndef TP_STATUS_VLAN_TPID_VALID +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) +#endif +#undef tpacket_auxdata +#define tpacket_auxdata rpl_tpacket_auxdata +struct tpacket_auxdata { + uint32_t tp_status; + uint32_t tp_len; + uint32_t tp_snaplen; + uint16_t tp_mac; + uint16_t tp_net; + uint16_t tp_vlan_tci; + uint16_t tp_vlan_tpid; +}; + enum { VALID_IFINDEX = 1 << 0, VALID_ETHERADDR = 1 << 1, @@ -763,7 +788,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) rx->fd = netdev->tap_fd; } else { struct sockaddr_ll sll; - int ifindex; + int ifindex, val; /* Result of tcpdump -dd inbound */ static const struct sock_filter filt[] = { { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */ @@ -783,6 +808,14 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) goto error; } + val = 1; + if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) { + error = errno; + VLOG_ERR("%s: failed to mark socket for auxdata (%s)", + netdev_get_name(netdev_), ovs_strerror(error)); + goto error; + } + /* Set non-blocking mode. 
*/ error = set_nonblocking(rx->fd); if (error) { @@ -799,7 +832,7 @@ netdev_linux_rx_construct(struct netdev_rx *rx_) memset(&sll, 0, sizeof sll); sll.sll_family = AF_PACKET; sll.sll_ifindex = ifindex; - sll.sll_protocol = (OVS_FORCE unsigned short int) htons(ETH_P_ALL); + sll.sll_protocol = htons(ETH_P_ALL); if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) { error = errno; VLOG_ERR("%s: failed to bind raw socket (%s)", @@ -847,31 +880,120 @@ netdev_linux_rx_dealloc(struct netdev_rx *rx_) free(rx); } +static ovs_be16 +auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux) +{ + if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) { + return htons(aux->tp_vlan_tpid); + } else { + return htons(ETH_TYPE_VLAN); + } +} + +static bool +auxdata_has_vlan_tci(const struct tpacket_auxdata *aux) +{ + return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID; +} + static int -netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer) +netdev_linux_rx_recv_sock(int fd, struct ofpbuf *buffer) { - struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + size_t size; ssize_t retval; - size_t size = ofpbuf_tailroom(buffer); + struct iovec iov; + struct cmsghdr *cmsg; + union { + struct cmsghdr cmsg; + char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))]; + } cmsg_buffer; + struct msghdr msgh; + + /* Reserve headroom for a single VLAN tag */ + ofpbuf_reserve(buffer, VLAN_HEADER_LEN); + size = ofpbuf_tailroom(buffer); + + iov.iov_base = buffer->data; + iov.iov_len = size; + msgh.msg_name = NULL; + msgh.msg_namelen = 0; + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = &cmsg_buffer; + msgh.msg_controllen = sizeof cmsg_buffer; + msgh.msg_flags = 0; do { - retval = (rx->is_tap - ? 
read(rx->fd, buffer->data, size) - : recv(rx->fd, buffer->data, size, MSG_TRUNC)); + retval = recvmsg(fd, &msgh, MSG_TRUNC); } while (retval < 0 && errno == EINTR); if (retval < 0) { - if (errno != EAGAIN) { - VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s", - ovs_strerror(errno), netdev_rx_get_name(rx_)); + return errno; + } else if (retval > size) { + return EMSGSIZE; + } + + buffer->size += retval; + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + const struct tpacket_auxdata *aux; + + if (cmsg->cmsg_level != SOL_PACKET + || cmsg->cmsg_type != PACKET_AUXDATA + || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) { + continue; + } + + aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg)); + if (auxdata_has_vlan_tci(aux)) { + if (retval < ETH_HEADER_LEN) { + return EINVAL; + } + + eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux), + htons(aux->tp_vlan_tci)); + break; } + } + + return 0; +} + +static int +netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer) +{ + ssize_t retval; + size_t size = ofpbuf_tailroom(buffer); + + do { + retval = read(fd, buffer->data, size); + } while (retval < 0 && errno == EINTR); + + if (retval < 0) { return errno; } else if (retval > size) { return EMSGSIZE; - } else { - buffer->size += retval; - return 0; } + + buffer->size += retval; + return 0; +} + +static int +netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer) +{ + struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_); + int retval; + + retval = (rx->is_tap + ? 
netdev_linux_rx_recv_tap(rx->fd, buffer) + : netdev_linux_rx_recv_sock(rx->fd, buffer)); + if (retval && retval != EAGAIN && retval != EMSGSIZE) { + VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s", + ovs_strerror(errno), netdev_rx_get_name(rx_)); + } + + return retval; } static void diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index 1dcc1f45f..673d3aba7 100644 --- a/lib/netdev-provider.h +++ b/lib/netdev-provider.h @@ -643,6 +643,16 @@ struct netdev_class { * Must return EMSGSIZE, and discard the packet, if the received packet * is longer than 'ofpbuf_tailroom(buffer)'. * + * Implementations may make use of VLAN_HEADER_LEN bytes of tailroom to + * add a VLAN header which is obtained out-of-band to the packet. If + * this occurs then VLAN_HEADER_LEN bytes of tailroom will no longer be + * available for the packet, otherwise it may be used for the packet + * itself. + * + * It is advised that the tailroom of 'buffer' should be + * VLAN_HEADER_LEN bytes longer than the MTU to allow space for an + * out-of-band VLAN header to be added to the packet. + * * This function may be set to null if it would always return EOPNOTSUPP * anyhow. */ int (*rx_recv)(struct netdev_rx *rx, struct ofpbuf *buffer); diff --git a/lib/netdev.c b/lib/netdev.c index 66b411a44..8e6242170 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -500,6 +500,13 @@ netdev_parse_name(const char *netdev_name_, char **name, char **type) } } +/* Attempts to open a netdev_rx handle for obtaining packets received on + * 'netdev'. On success, returns 0 and stores a nonnull 'netdev_rx *' into + * '*rxp'. On failure, returns a positive errno value and stores NULL into + * '*rxp'. + * + * Some kinds of network devices might not support receiving packets. 
This + * function returns EOPNOTSUPP in that case.*/ int netdev_rx_open(struct netdev *netdev, struct netdev_rx **rxp) OVS_EXCLUDED(netdev_mutex) @@ -531,6 +538,7 @@ netdev_rx_open(struct netdev *netdev, struct netdev_rx **rxp) return error; } +/* Closes 'rx'. */ void netdev_rx_close(struct netdev_rx *rx) OVS_EXCLUDED(netdev_mutex) @@ -543,6 +551,29 @@ netdev_rx_close(struct netdev_rx *rx) } } +/* Attempts to receive a packet from 'rx' into the tailroom of 'buffer', which + * must initially be empty. If successful, returns 0 and increments + * 'buffer->size' by the number of bytes in the received packet, otherwise a + * positive errno value. + * + * Returns EAGAIN immediately if no packet is ready to be received. + * + * Returns EMSGSIZE, and discards the packet, if the received packet is longer + * than 'ofpbuf_tailroom(buffer)'. + * + * Implementations may make use of VLAN_HEADER_LEN bytes of tailroom to + * add a VLAN header which is obtained out-of-band to the packet. If + * this occurs then VLAN_HEADER_LEN bytes of tailroom will no longer be + * available for the packet, otherwise it may be used for the packet + * itself. + * + * It is advised that the tailroom of 'buffer' should be + * VLAN_HEADER_LEN bytes longer than the MTU to allow space for an + * out-of-band VLAN header to be added to the packet. At the very least, + * 'buffer' must have at least ETH_TOTAL_MIN bytes of tailroom. + * + * This function may be set to null if it would always return EOPNOTSUPP + * anyhow. */ int netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf *buffer) { @@ -563,12 +594,15 @@ netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf *buffer) } } +/* Arranges for poll_block() to wake up when a packet is ready to be received + * on 'rx'. */ void netdev_rx_wait(struct netdev_rx *rx) { rx->netdev->netdev_class->rx_wait(rx); } +/* Discards any packets ready to be received on 'rx'. */ int netdev_rx_drain(struct netdev_rx *rx) { |