/*
 * Copyright (c) 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* NOTE(review): every bare "#include" below has no operand in this copy of
 * the file.  The header names have to be restored before this can build. */
#include

#include "netdev-vport.h"

#include
#include
#include
#include
#include
#include
#include

#include "byte-order.h"
#include "csum.h"
#include "daemon.h"
#include "dirs.h"
#include "dpif.h"
#include "dp-packet.h"
#include "openvswitch/dynamic-string.h"
#include "flow.h"
#include "hash.h"
#include "hmap.h"
#include "list.h"
#include "netdev-provider.h"
#include "odp-netlink.h"
#include "dp-packet.h"
#include "ovs-router.h"
#include "packets.h"
#include "poll-loop.h"
#include "route-table.h"
#include "shash.h"
#include "socket-util.h"
#include "openvswitch/vlog.h"
#include "unaligned.h"
#include "unixctl.h"
#include "util.h"

VLOG_DEFINE_THIS_MODULE(netdev_vport);

/* Rate limit used by the tunnel header parsers below when logging bad
 * packets. */
static struct vlog_rate_limit err_rl = VLOG_RATE_LIMIT_INIT(60, 5);

/* Destination ports installed by default, per tunnel type, in
 * netdev_vport_construct() and set_tunnel_config(). */
#define GENEVE_DST_PORT 6081
#define VXLAN_DST_PORT 4789
#define LISP_DST_PORT 4341
#define STT_DST_PORT 7471

/* UDP header plus VXLAN header. */
#define VXLAN_HLEN   (sizeof(struct udp_header) +         \
                      sizeof(struct vxlanhdr))

/* UDP header plus fixed Geneve header (options not included). */
#define GENEVE_BASE_HLEN   (sizeof(struct udp_header) +         \
                            sizeof(struct genevehdr))

/* TTL used when the configuration does not give one. */
#define DEFAULT_TTL 64

/* A netdev implemented by this file: either a tunnel port or a patch port. */
struct netdev_vport {
    struct netdev up;

    /* Protects all members below. */
    struct ovs_mutex mutex;

    struct eth_addr etheraddr;
    struct netdev_stats stats;

    /* Tunnels. */
    struct netdev_tunnel_config tnl_cfg;
    char egress_iface[IFNAMSIZ];
    bool carrier_status;

    /* Patch Ports. */
    char *peer;
};

/* A netdev class together with the datapath port name that
 * netdev_vport_class_get_dpif_port() reports for it. */
struct vport_class {
    const char *dpif_port;
    struct netdev_class netdev_class;
};

/* Last read of the route-table's change number. */
static uint64_t rt_change_seqno;

static int netdev_vport_construct(struct netdev *);
static int get_patch_config(const struct netdev *netdev, struct smap *args);
static int get_tunnel_config(const struct netdev *, struct smap *args);
static bool tunnel_check_status_change__(struct netdev_vport *);

/* Bounds of the UDP source port range used by get_src_port(); adjustable at
 * run time through netdev_vport_range(). */
static uint16_t tnl_udp_port_min = 32768;
static uint16_t tnl_udp_port_max = 61000;

/* Returns true if 'class' is one of this file's classes, recognized by its
 * 'construct' callback being netdev_vport_construct(). */
static bool
is_vport_class(const struct netdev_class *class)
{
    return class->construct == netdev_vport_construct;
}

/* Public wrapper around is_vport_class(). */
bool
netdev_vport_is_vport_class(const struct netdev_class *class)
{
    return is_vport_class(class);
}

/* Returns the 'struct vport_class' that embeds 'class'.  Asserts that 'class'
 * is a vport class. */
static const struct vport_class *
vport_class_cast(const struct netdev_class *class)
{
    ovs_assert(is_vport_class(class));
    return CONTAINER_OF(class, struct vport_class, netdev_class);
}

/* Returns the 'struct netdev_vport' that embeds 'netdev'.  Asserts that
 * 'netdev' belongs to a vport class. */
static struct netdev_vport *
netdev_vport_cast(const struct netdev *netdev)
{
    ovs_assert(is_vport_class(netdev_get_class(netdev)));
    return CONTAINER_OF(netdev, struct netdev_vport, up);
}

/* Returns a pointer to 'netdev''s tunnel configuration.  The mutex is not
 * taken here. */
static const struct netdev_tunnel_config *
get_netdev_tunnel_config(const struct netdev *netdev)
{
    return &netdev_vport_cast(netdev)->tnl_cfg;
}

/* Returns true if 'netdev' is a patch port, i.e. its class uses
 * get_patch_config() as its 'get_config' callback. */
bool
netdev_vport_is_patch(const struct netdev *netdev)
{
    const struct netdev_class *class = netdev_get_class(netdev);

    return class->get_config == get_patch_config;
}

/* Returns true only for netdevs whose type is "lisp". */
bool
netdev_vport_is_layer3(const struct netdev *dev)
{
    const char *type = netdev_get_type(dev);

    return (!strcmp("lisp", type));
}

/* Returns true if 'dev' is a tunnel (its 'get_config' is get_tunnel_config())
 * of type "geneve", "vxlan", "lisp" or "stt". */
static bool
netdev_vport_needs_dst_port(const struct netdev *dev)
{
    const struct netdev_class *class = netdev_get_class(dev);
    const char *type = netdev_get_type(dev);

    return (class->get_config == get_tunnel_config &&
            (!strcmp("geneve", type) || !strcmp("vxlan", type) ||
             !strcmp("lisp", type) || !strcmp("stt", type)) );
}

/* Returns the datapath port name recorded for 'class', or NULL if 'class' is
 * not a vport class. */
const char *
netdev_vport_class_get_dpif_port(const struct netdev_class *class)
{
    return is_vport_class(class) ? vport_class_cast(class)->dpif_port : NULL;
}

/* Returns the datapath port name for 'netdev'.
 *
 * If the class has no datapath port name, the netdev's own name is returned.
 * For tunnel types that need a destination port, the result is
 * "<dpif_port>_<dst_port>" formatted into 'namebuf' (of 'bufsize' bytes) and
 * 'namebuf' is returned.  Otherwise the class's datapath port name is
 * returned and 'namebuf' is left untouched. */
const char *
netdev_vport_get_dpif_port(const struct netdev *netdev,
                           char namebuf[], size_t bufsize)
{
    const struct netdev_class *class = netdev_get_class(netdev);
    const char *dpif_port = netdev_vport_class_get_dpif_port(class);

    if (!dpif_port) {
        return netdev_get_name(netdev);
    }

    if (netdev_vport_needs_dst_port(netdev)) {
        const struct netdev_vport *vport = netdev_vport_cast(netdev);

        /*
         * Note: IFNAMSIZ is 16 bytes long. Implementations should choose
         * a dpif port name that is short enough to fit including any
         * port numbers but assert just in case.
         */
        BUILD_ASSERT(NETDEV_VPORT_NAME_BUFSIZE >= IFNAMSIZ);
        /* The 6 covers '_' plus up to five digits of a 16-bit port. */
        ovs_assert(strlen(dpif_port) + 6 < IFNAMSIZ);
        snprintf(namebuf, bufsize, "%s_%d", dpif_port,
                 ntohs(vport->tnl_cfg.dst_port));
        return namebuf;
    } else {
        return dpif_port;
    }
}

/* Like netdev_vport_get_dpif_port(), but returns a copy made with xstrdup(). */
char *
netdev_vport_get_dpif_port_strdup(const struct netdev *netdev)
{
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];

    return xstrdup(netdev_vport_get_dpif_port(netdev, namebuf,
                                              sizeof namebuf));
}

/* Whenever the route-table change number is incremented,
 * netdev_vport_route_changed() should be called to update
 * the corresponding tunnel interface status. */
static void
netdev_vport_route_changed(void)
{
    struct netdev **vports;
    size_t i, n_vports;

    vports = netdev_get_vports(&n_vports);
    for (i = 0; i < n_vports; i++) {
        struct netdev *netdev_ = vports[i];
        struct netdev_vport *netdev = netdev_vport_cast(netdev_);

        ovs_mutex_lock(&netdev->mutex);
        /* Finds all tunnel vports. */
        if (ipv6_addr_is_set(&netdev->tnl_cfg.ipv6_dst)) {
            if (tunnel_check_status_change__(netdev)) {
                netdev_change_seq_changed(netdev_);
            }
        }
        ovs_mutex_unlock(&netdev->mutex);

        /* NOTE(review): presumably drops a reference taken by
         * netdev_get_vports() -- confirm against its contract. */
        netdev_close(netdev_);
    }

    free(vports);
}

/* Allocates a zeroed 'struct netdev_vport' and returns its embedded netdev. */
static struct netdev *
netdev_vport_alloc(void)
{
    struct netdev_vport *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}

/* Initializes a freshly allocated vport: mutex, a random Ethernet address,
 * the per-type default destination port, and the default DF/TTL settings.
 * Always returns 0. */
static int
netdev_vport_construct(struct netdev *netdev_)
{
    struct netdev_vport *dev = netdev_vport_cast(netdev_);
    const char *type = netdev_get_type(netdev_);

    ovs_mutex_init(&dev->mutex);
    eth_addr_random(&dev->etheraddr);

    /* Add a default destination port for tunnel ports if none specified. */
    if (!strcmp(type, "geneve")) {
        dev->tnl_cfg.dst_port = htons(GENEVE_DST_PORT);
    } else if (!strcmp(type, "vxlan")) {
        dev->tnl_cfg.dst_port = htons(VXLAN_DST_PORT);
    } else if (!strcmp(type, "lisp")) {
        dev->tnl_cfg.dst_port = htons(LISP_DST_PORT);
    } else if (!strcmp(type, "stt")) {
        dev->tnl_cfg.dst_port = htons(STT_DST_PORT);
    }

    dev->tnl_cfg.dont_fragment = true;
    dev->tnl_cfg.ttl = DEFAULT_TTL;
    return 0;
}

/* Releases what netdev_vport_construct() and later configuration set up:
 * the patch peer name and the mutex. */
static void
netdev_vport_destruct(struct netdev *netdev_)
{
    struct netdev_vport *netdev = netdev_vport_cast(netdev_);

    free(netdev->peer);
    ovs_mutex_destroy(&netdev->mutex);
}

/* Frees the memory obtained in netdev_vport_alloc(). */
static void
netdev_vport_dealloc(struct netdev *netdev_)
{
    struct netdev_vport *netdev = netdev_vport_cast(netdev_);
    free(netdev);
}

/* Stores 'mac' as the port's Ethernet address and bumps the netdev's change
 * sequence.  Always returns 0. */
static int
netdev_vport_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_vport *netdev = netdev_vport_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    netdev->etheraddr = mac;
    ovs_mutex_unlock(&netdev->mutex);
    netdev_change_seq_changed(netdev_);

    return 0;
}

/* Copies the port's Ethernet address into '*mac'.  Always returns 0. */
static int
netdev_vport_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
{
    struct netdev_vport *netdev = netdev_vport_cast(netdev_);

    ovs_mutex_lock(&netdev->mutex);
    *mac = netdev->etheraddr;
    ovs_mutex_unlock(&netdev->mutex);

    return 0;
}

/* Checks if the tunnel status has changed and returns a boolean. 
* Updates the tunnel status if it has changed. */ static bool tunnel_check_status_change__(struct netdev_vport *netdev) OVS_REQUIRES(netdev->mutex) { char iface[IFNAMSIZ]; bool status = false; struct in6_addr *route; struct in6_addr gw; iface[0] = '\0'; route = &netdev->tnl_cfg.ipv6_dst; if (ovs_router_lookup(route, iface, NULL, &gw)) { struct netdev *egress_netdev; if (!netdev_open(iface, "system", &egress_netdev)) { status = netdev_get_carrier(egress_netdev); netdev_close(egress_netdev); } } if (strcmp(netdev->egress_iface, iface) || netdev->carrier_status != status) { ovs_strlcpy(netdev->egress_iface, iface, IFNAMSIZ); netdev->carrier_status = status; return true; } return false; } static int tunnel_get_status(const struct netdev *netdev_, struct smap *smap) { struct netdev_vport *netdev = netdev_vport_cast(netdev_); if (netdev->egress_iface[0]) { smap_add(smap, "tunnel_egress_iface", netdev->egress_iface); smap_add(smap, "tunnel_egress_iface_carrier", netdev->carrier_status ? "up" : "down"); } return 0; } static int netdev_vport_update_flags(struct netdev *netdev OVS_UNUSED, enum netdev_flags off, enum netdev_flags on OVS_UNUSED, enum netdev_flags *old_flagsp) { if (off & (NETDEV_UP | NETDEV_PROMISC)) { return EOPNOTSUPP; } *old_flagsp = NETDEV_UP | NETDEV_PROMISC; return 0; } static void netdev_vport_run(void) { uint64_t seq; route_table_run(); seq = route_table_get_change_seq(); if (rt_change_seqno != seq) { rt_change_seqno = seq; netdev_vport_route_changed(); } } static void netdev_vport_wait(void) { uint64_t seq; route_table_wait(); seq = route_table_get_change_seq(); if (rt_change_seqno != seq) { poll_immediate_wake(); } } /* Code specific to tunnel types. 
*/ static ovs_be64 parse_key(const struct smap *args, const char *name, bool *present, bool *flow) { const char *s; *present = false; *flow = false; s = smap_get(args, name); if (!s) { s = smap_get(args, "key"); if (!s) { return 0; } } *present = true; if (!strcmp(s, "flow")) { *flow = true; return 0; } else { return htonll(strtoull(s, NULL, 0)); } } static int parse_tunnel_ip(const char *value, bool accept_mcast, bool *flow, struct in6_addr *ipv6, uint16_t *protocol) { if (!strcmp(value, "flow")) { *flow = true; *protocol = 0; return 0; } if (addr_is_ipv6(value)) { if (lookup_ipv6(value, ipv6)) { return ENOENT; } if (!accept_mcast && ipv6_addr_is_multicast(ipv6)) { return EINVAL; } *protocol = ETH_TYPE_IPV6; } else { struct in_addr ip; if (lookup_ip(value, &ip)) { return ENOENT; } if (!accept_mcast && ip_is_multicast(ip.s_addr)) { return EINVAL; } in6_addr_set_mapped_ipv4(ipv6, ip.s_addr); *protocol = ETH_TYPE_IP; } return 0; } static int set_tunnel_config(struct netdev *dev_, const struct smap *args) { struct netdev_vport *dev = netdev_vport_cast(dev_); const char *name = netdev_get_name(dev_); const char *type = netdev_get_type(dev_); bool ipsec_mech_set, needs_dst_port, has_csum; uint16_t dst_proto = 0, src_proto = 0; struct netdev_tunnel_config tnl_cfg; struct smap_node *node; has_csum = strstr(type, "gre") || strstr(type, "geneve") || strstr(type, "stt") || strstr(type, "vxlan"); ipsec_mech_set = false; memset(&tnl_cfg, 0, sizeof tnl_cfg); /* Add a default destination port for tunnel ports if none specified. 
*/ if (!strcmp(type, "geneve")) { tnl_cfg.dst_port = htons(GENEVE_DST_PORT); } if (!strcmp(type, "vxlan")) { tnl_cfg.dst_port = htons(VXLAN_DST_PORT); } if (!strcmp(type, "lisp")) { tnl_cfg.dst_port = htons(LISP_DST_PORT); } if (!strcmp(type, "stt")) { tnl_cfg.dst_port = htons(STT_DST_PORT); } needs_dst_port = netdev_vport_needs_dst_port(dev_); tnl_cfg.ipsec = strstr(type, "ipsec"); tnl_cfg.dont_fragment = true; SMAP_FOR_EACH (node, args) { if (!strcmp(node->key, "remote_ip")) { int err; err = parse_tunnel_ip(node->value, false, &tnl_cfg.ip_dst_flow, &tnl_cfg.ipv6_dst, &dst_proto); switch (err) { case ENOENT: VLOG_WARN("%s: bad %s 'remote_ip'", name, type); break; case EINVAL: VLOG_WARN("%s: multicast remote_ip=%s not allowed", name, node->value); return EINVAL; } } else if (!strcmp(node->key, "local_ip")) { int err; err = parse_tunnel_ip(node->value, true, &tnl_cfg.ip_src_flow, &tnl_cfg.ipv6_src, &src_proto); switch (err) { case ENOENT: VLOG_WARN("%s: bad %s 'local_ip'", name, type); break; } } else if (!strcmp(node->key, "tos")) { if (!strcmp(node->value, "inherit")) { tnl_cfg.tos_inherit = true; } else { char *endptr; int tos; tos = strtol(node->value, &endptr, 0); if (*endptr == '\0' && tos == (tos & IP_DSCP_MASK)) { tnl_cfg.tos = tos; } else { VLOG_WARN("%s: invalid TOS %s", name, node->value); } } } else if (!strcmp(node->key, "ttl")) { if (!strcmp(node->value, "inherit")) { tnl_cfg.ttl_inherit = true; } else { tnl_cfg.ttl = atoi(node->value); } } else if (!strcmp(node->key, "dst_port") && needs_dst_port) { tnl_cfg.dst_port = htons(atoi(node->value)); } else if (!strcmp(node->key, "csum") && has_csum) { if (!strcmp(node->value, "true")) { tnl_cfg.csum = true; } } else if (!strcmp(node->key, "df_default")) { if (!strcmp(node->value, "false")) { tnl_cfg.dont_fragment = false; } } else if (!strcmp(node->key, "peer_cert") && tnl_cfg.ipsec) { if (smap_get(args, "certificate")) { ipsec_mech_set = true; } else { const char *use_ssl_cert; /* If the "use_ssl_cert" is 
true, then "certificate" and * "private_key" will be pulled from the SSL table. The * use of this option is strongly discouraged, since it * will like be removed when multiple SSL configurations * are supported by OVS. */ use_ssl_cert = smap_get(args, "use_ssl_cert"); if (!use_ssl_cert || strcmp(use_ssl_cert, "true")) { VLOG_ERR("%s: 'peer_cert' requires 'certificate' argument", name); return EINVAL; } ipsec_mech_set = true; } } else if (!strcmp(node->key, "psk") && tnl_cfg.ipsec) { ipsec_mech_set = true; } else if (tnl_cfg.ipsec && (!strcmp(node->key, "certificate") || !strcmp(node->key, "private_key") || !strcmp(node->key, "use_ssl_cert"))) { /* Ignore options not used by the netdev. */ } else if (!strcmp(node->key, "key") || !strcmp(node->key, "in_key") || !strcmp(node->key, "out_key")) { /* Handled separately below. */ } else if (!strcmp(node->key, "exts")) { char *str = xstrdup(node->value); char *ext, *save_ptr = NULL; tnl_cfg.exts = 0; ext = strtok_r(str, ",", &save_ptr); while (ext) { if (!strcmp(type, "vxlan") && !strcmp(ext, "gbp")) { tnl_cfg.exts |= (1 << OVS_VXLAN_EXT_GBP); } else { VLOG_WARN("%s: unknown extension '%s'", name, ext); } ext = strtok_r(NULL, ",", &save_ptr); } free(str); } else { VLOG_WARN("%s: unknown %s argument '%s'", name, type, node->key); } } if (tnl_cfg.ipsec) { static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER; static pid_t pid = 0; #ifndef _WIN32 ovs_mutex_lock(&mutex); if (pid <= 0) { char *file_name = xasprintf("%s/%s", ovs_rundir(), "ovs-monitor-ipsec.pid"); pid = read_pidfile(file_name); free(file_name); } ovs_mutex_unlock(&mutex); #endif if (pid < 0) { VLOG_ERR("%s: IPsec requires the ovs-monitor-ipsec daemon", name); return EINVAL; } if (smap_get(args, "peer_cert") && smap_get(args, "psk")) { VLOG_ERR("%s: cannot define both 'peer_cert' and 'psk'", name); return EINVAL; } if (!ipsec_mech_set) { VLOG_ERR("%s: IPsec requires an 'peer_cert' or psk' argument", name); return EINVAL; } } if 
(!ipv6_addr_is_set(&tnl_cfg.ipv6_dst) && !tnl_cfg.ip_dst_flow) { VLOG_ERR("%s: %s type requires valid 'remote_ip' argument", name, type); return EINVAL; } if (tnl_cfg.ip_src_flow && !tnl_cfg.ip_dst_flow) { VLOG_ERR("%s: %s type requires 'remote_ip=flow' with 'local_ip=flow'", name, type); return EINVAL; } if (src_proto && dst_proto && src_proto != dst_proto) { VLOG_ERR("%s: 'remote_ip' and 'local_ip' has to be of the same address family", name); return EINVAL; } if (!tnl_cfg.ttl) { tnl_cfg.ttl = DEFAULT_TTL; } tnl_cfg.in_key = parse_key(args, "in_key", &tnl_cfg.in_key_present, &tnl_cfg.in_key_flow); tnl_cfg.out_key = parse_key(args, "out_key", &tnl_cfg.out_key_present, &tnl_cfg.out_key_flow); ovs_mutex_lock(&dev->mutex); if (memcmp(&dev->tnl_cfg, &tnl_cfg, sizeof tnl_cfg)) { dev->tnl_cfg = tnl_cfg; tunnel_check_status_change__(dev); netdev_change_seq_changed(dev_); } ovs_mutex_unlock(&dev->mutex); return 0; } static int get_tunnel_config(const struct netdev *dev, struct smap *args) { struct netdev_vport *netdev = netdev_vport_cast(dev); struct netdev_tunnel_config tnl_cfg; ovs_mutex_lock(&netdev->mutex); tnl_cfg = netdev->tnl_cfg; ovs_mutex_unlock(&netdev->mutex); if (ipv6_addr_is_set(&tnl_cfg.ipv6_dst)) { smap_add_ipv6(args, "remote_ip", &tnl_cfg.ipv6_dst); } else if (tnl_cfg.ip_dst_flow) { smap_add(args, "remote_ip", "flow"); } if (ipv6_addr_is_set(&tnl_cfg.ipv6_src)) { smap_add_ipv6(args, "local_ip", &tnl_cfg.ipv6_src); } else if (tnl_cfg.ip_src_flow) { smap_add(args, "local_ip", "flow"); } if (tnl_cfg.in_key_flow && tnl_cfg.out_key_flow) { smap_add(args, "key", "flow"); } else if (tnl_cfg.in_key_present && tnl_cfg.out_key_present && tnl_cfg.in_key == tnl_cfg.out_key) { smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg.in_key)); } else { if (tnl_cfg.in_key_flow) { smap_add(args, "in_key", "flow"); } else if (tnl_cfg.in_key_present) { smap_add_format(args, "in_key", "%"PRIu64, ntohll(tnl_cfg.in_key)); } if (tnl_cfg.out_key_flow) { smap_add(args, "out_key", 
"flow"); } else if (tnl_cfg.out_key_present) { smap_add_format(args, "out_key", "%"PRIu64, ntohll(tnl_cfg.out_key)); } } if (tnl_cfg.ttl_inherit) { smap_add(args, "ttl", "inherit"); } else if (tnl_cfg.ttl != DEFAULT_TTL) { smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg.ttl); } if (tnl_cfg.tos_inherit) { smap_add(args, "tos", "inherit"); } else if (tnl_cfg.tos) { smap_add_format(args, "tos", "0x%x", tnl_cfg.tos); } if (tnl_cfg.dst_port) { uint16_t dst_port = ntohs(tnl_cfg.dst_port); const char *type = netdev_get_type(dev); if ((!strcmp("geneve", type) && dst_port != GENEVE_DST_PORT) || (!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) || (!strcmp("lisp", type) && dst_port != LISP_DST_PORT) || (!strcmp("stt", type) && dst_port != STT_DST_PORT)) { smap_add_format(args, "dst_port", "%d", dst_port); } } if (tnl_cfg.csum) { smap_add(args, "csum", "true"); } if (!tnl_cfg.dont_fragment) { smap_add(args, "df_default", "false"); } return 0; } /* Code specific to patch ports. */ /* If 'netdev' is a patch port, returns the name of its peer as a malloc()'d * string that the caller must free. * * If 'netdev' is not a patch port, returns NULL. 
*/
char *
netdev_vport_patch_peer(const struct netdev *netdev_)
{
    char *peer = NULL;

    if (netdev_vport_is_patch(netdev_)) {
        struct netdev_vport *netdev = netdev_vport_cast(netdev_);

        ovs_mutex_lock(&netdev->mutex);
        if (netdev->peer) {
            peer = xstrdup(netdev->peer);
        }
        ovs_mutex_unlock(&netdev->mutex);
    }

    return peer;
}

/* Adds the packet and byte counts in 'stats' to 'netdev''s receive counters.
 * Does nothing if 'netdev' is not a vport. */
void
netdev_vport_inc_rx(const struct netdev *netdev,
                    const struct dpif_flow_stats *stats)
{
    if (is_vport_class(netdev_get_class(netdev))) {
        struct netdev_vport *dev = netdev_vport_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        dev->stats.rx_packets += stats->n_packets;
        dev->stats.rx_bytes += stats->n_bytes;
        ovs_mutex_unlock(&dev->mutex);
    }
}

/* Adds the packet and byte counts in 'stats' to 'netdev''s transmit
 * counters.  Does nothing if 'netdev' is not a vport. */
void
netdev_vport_inc_tx(const struct netdev *netdev,
                    const struct dpif_flow_stats *stats)
{
    if (is_vport_class(netdev_get_class(netdev))) {
        struct netdev_vport *dev = netdev_vport_cast(netdev);

        ovs_mutex_lock(&dev->mutex);
        dev->stats.tx_packets += stats->n_packets;
        dev->stats.tx_bytes += stats->n_bytes;
        ovs_mutex_unlock(&dev->mutex);
    }
}

/* Adds the patch port's "peer" option to 'args' if a peer is configured.
 * Always returns 0. */
static int
get_patch_config(const struct netdev *dev_, struct smap *args)
{
    struct netdev_vport *dev = netdev_vport_cast(dev_);

    ovs_mutex_lock(&dev->mutex);
    if (dev->peer) {
        smap_add(args, "peer", dev->peer);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

/* Configures the patch port's peer from 'args'.
 *
 * Returns EINVAL if "peer" is missing, if 'args' holds anything besides
 * "peer", or if the peer is the port itself.  Otherwise stores a copy of the
 * peer name (bumping the change sequence only when it differs from the
 * current one) and returns 0. */
static int
set_patch_config(struct netdev *dev_, const struct smap *args)
{
    struct netdev_vport *dev = netdev_vport_cast(dev_);
    const char *name = netdev_get_name(dev_);
    const char *peer;

    peer = smap_get(args, "peer");
    if (!peer) {
        VLOG_ERR("%s: patch type requires valid 'peer' argument", name);
        return EINVAL;
    }

    if (smap_count(args) > 1) {
        VLOG_ERR("%s: patch type takes only a 'peer' argument", name);
        return EINVAL;
    }

    if (!strcmp(name, peer)) {
        VLOG_ERR("%s: patch peer must not be self", name);
        return EINVAL;
    }

    ovs_mutex_lock(&dev->mutex);
    if (!dev->peer || strcmp(dev->peer, peer)) {
        free(dev->peer);
        dev->peer = xstrdup(peer);
        netdev_change_seq_changed(dev_);
    }
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}

/* Copies the counters kept by netdev_vport_inc_rx()/netdev_vport_inc_tx()
 * into '*stats'.  Always returns 0. */
static int
get_stats(const struct netdev *netdev, struct netdev_stats *stats)
{
    struct netdev_vport *dev = netdev_vport_cast(netdev);

    ovs_mutex_lock(&dev->mutex);
    *stats = dev->stats;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}


/* Tunnel push pop ops. */

/* Returns the IPv4 header that immediately follows the Ethernet header at
 * 'eth'. */
static struct ip_header *
ip_hdr(void *eth)
{
    return (void *)((char *)eth + sizeof (struct eth_header));
}

/* Returns the IPv6 header that immediately follows the Ethernet header at
 * 'eth'. */
static struct ovs_16aligned_ip6_hdr *
ipv6_hdr(void *eth)
{
    return (void *)((char *)eth + sizeof (struct eth_header));
}

/* Validates the outer IP header of 'packet' and copies its addresses, TOS
 * and TTL into 'tnl'.
 *
 * On success stores the combined Ethernet + IP header length in '*hlen' and
 * returns a pointer to the L4 header.  Returns NULL if the packet has no L3
 * or L4 pointer, if an IPv4 header has a bad checksum, claims more bytes
 * than the packet holds or carries options, or if the IP version is neither
 * 4 nor 6. */
static void *
ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
                  unsigned int *hlen)
{
    void *nh;
    struct ip_header *ip;
    struct ovs_16aligned_ip6_hdr *ip6;
    void *l4;
    int l3_size;

    nh = dp_packet_l3(packet);
    ip = nh;
    ip6 = nh;
    l4 = dp_packet_l4(packet);

    if (!nh || !l4) {
        return NULL;
    }

    *hlen = sizeof(struct eth_header);

    /* Bytes available from the start of the L3 header to the packet end. */
    l3_size = dp_packet_size(packet) -
              ((char *)nh - (char *)dp_packet_data(packet));

    if (IP_VER(ip->ip_ihl_ver) == 4) {

        ovs_be32 ip_src, ip_dst;

        if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) {
            VLOG_WARN_RL(&err_rl, "ip packet has invalid checksum");
            return NULL;
        }

        if (ntohs(ip->ip_tot_len) > l3_size) {
            VLOG_WARN_RL(&err_rl, "ip packet is truncated (IP length %d, actual %d)",
                         ntohs(ip->ip_tot_len), l3_size);
            return NULL;
        }
        if (IP_IHL(ip->ip_ihl_ver) * 4 > sizeof(struct ip_header)) {
            VLOG_WARN_RL(&err_rl, "ip options not supported on tunnel packets "
                         "(%d bytes)", IP_IHL(ip->ip_ihl_ver) * 4);
            return NULL;
        }

        ip_src = get_16aligned_be32(&ip->ip_src);
        ip_dst = get_16aligned_be32(&ip->ip_dst);

        tnl->ip_src = ip_src;
        tnl->ip_dst = ip_dst;
        tnl->ip_tos = ip->ip_tos;
        tnl->ip_ttl = ip->ip_ttl;

        *hlen += IP_HEADER_LEN;

    } else if (IP_VER(ip->ip_ihl_ver) == 6) {

        memcpy(tnl->ipv6_src.s6_addr, ip6->ip6_src.be16, sizeof ip6->ip6_src);
        memcpy(tnl->ipv6_dst.s6_addr, ip6->ip6_dst.be16, sizeof ip6->ip6_dst);
        /* The IPv6 traffic class is not extracted; TOS is reported as 0. */
        tnl->ip_tos = 0;
        tnl->ip_ttl = ip6->ip6_hlim;

        *hlen += IPV6_HEADER_LEN;

    } else {
        VLOG_WARN_RL(&err_rl, "ipv4 packet has invalid version (%d)",
                     IP_VER(ip->ip_ihl_ver));
        return NULL;
    }

    return l4;
}

/* Returns true if the Ethernet header at 'header' has an EtherType of
 * ETH_TYPE_IPV6. */
static bool
is_header_ipv6(const void *header)
{
    const struct eth_header *eth;
    eth = header;
    return eth->eth_type == htons(ETH_TYPE_IPV6);
}

/* Pushes the 'size' bytes of 'header' into the headroom of 'packet',
 * reallocating the packet if necessary.  'header' should contain an Ethernet
 * header, followed by an IPv4 header (without options) or an IPv6 header,
 * and an L4 header.
 *
 * For IPv4 this function sets the IP header's ip_tot_len field (which should
 * be zeroed as part of 'header') and updates the IP header checksum to
 * match.  For IPv6 it sets ip6_plen.  In both cases '*ip_tot_size' holds, on
 * return, the number of bytes that follow the IP header.
 *
 * Return pointer to the L4 header added to 'packet'. */
static void *
push_ip_header(struct dp_packet *packet,
               const void *header, int size, int *ip_tot_size)
{
    struct eth_header *eth;
    struct ip_header *ip;
    struct ovs_16aligned_ip6_hdr *ip6;

    eth = dp_packet_push_uninit(packet, size);
    *ip_tot_size = dp_packet_size(packet) - sizeof (struct eth_header);

    memcpy(eth, header, size);

    if (is_header_ipv6(header)) {
        ip6 = ipv6_hdr(eth);
        *ip_tot_size -= IPV6_HEADER_LEN;
        ip6->ip6_plen = htons(*ip_tot_size);
        return ip6 + 1;
    } else {
        ip = ip_hdr(eth);
        ip->ip_tot_len = htons(*ip_tot_size);
        /* Incremental update: ip_tot_len went from 0 to its new value. */
        ip->ip_csum = recalc_csum16(ip->ip_csum, 0, ip->ip_tot_len);
        *ip_tot_size -= IP_HEADER_LEN;
        return ip + 1;
    }
}

/* Runs ip_extract_tnl_md() and then handles the outer UDP header.
 *
 * A non-zero UDP checksum is verified over the pseudo-header and the UDP
 * data; on success FLOW_TNL_F_CSUM is set in 'tnl'.  The UDP ports are
 * copied into 'tnl'.  Returns a pointer just past the UDP header, or NULL if
 * IP extraction fails or the checksum does not verify. */
static void *
udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
                   unsigned int *hlen)
{
    struct udp_header *udp;

    udp = ip_extract_tnl_md(packet, tnl, hlen);
    if (!udp) {
        return NULL;
    }

    if (udp->udp_csum) {
        uint32_t csum;
        if (is_header_ipv6(dp_packet_data(packet))) {
            csum = packet_csum_pseudoheader6(dp_packet_l3(packet));
        } else {
            csum = packet_csum_pseudoheader(dp_packet_l3(packet));
        }

        csum = csum_continue(csum, udp, dp_packet_size(packet) -
                             ((const unsigned char *)udp -
                              (const unsigned char *)dp_packet_l2(packet)));
        if (csum_finish(csum)) {
            return NULL;
        }
        tnl->flags |= FLOW_TNL_F_CSUM;
    }

    tnl->tp_src = udp->udp_src;
    tnl->tp_dst = udp->udp_dst;

    return udp + 1;
}

/* Derives a UDP source port from 'packet''s RSS hash by scaling the 32-bit
 * hash onto the width of [tnl_udp_port_min, tnl_udp_port_max) and adding
 * tnl_udp_port_min.  Returns the port in network byte order. */
static ovs_be16
get_src_port(struct dp_packet *packet)
{
    uint32_t hash;

    hash = dp_packet_get_rss_hash(packet);

    return htons((((uint64_t) hash * (tnl_udp_port_max - tnl_udp_port_min)) >> 32) +
                 tnl_udp_port_min);
}

/* Pushes the prebuilt header in 'data' onto 'packet', then fills in the UDP
 * source port and length.  If the prebuilt header carries a non-zero UDP
 * checksum marker, computes the real checksum; a result of 0 is sent as
 * 0xffff. */
static void
push_udp_header(struct dp_packet *packet,
                const struct ovs_action_push_tnl *data)
{
    struct udp_header *udp;
    int ip_tot_size;

    udp = push_ip_header(packet, data->header, data->header_len, &ip_tot_size);

    /* set udp src port */
    udp->udp_src = get_src_port(packet);
    udp->udp_len = htons(ip_tot_size);

    if (udp->udp_csum) {
        uint32_t csum;
        if (is_header_ipv6(dp_packet_data(packet))) {
            csum = packet_csum_pseudoheader6(ipv6_hdr(dp_packet_data(packet)));
        } else {
            csum = packet_csum_pseudoheader(ip_hdr(dp_packet_data(packet)));
        }

        csum = csum_continue(csum, udp, ip_tot_size);
        udp->udp_csum = csum_finish(csum);

        if (!udp->udp_csum) {
            udp->udp_csum = htons(0xffff);
        }
    }
}

/* Fills in the IP protocol field and the UDP destination port of the header
 * template in 'data', for an IPv4 or IPv6 outer header as indicated by the
 * template's EtherType.
 *
 * Stores the Ethernet + IP header length (UDP not included) in '*hlen' and
 * returns a pointer just past the UDP header. */
static void *
udp_build_header(struct netdev_tunnel_config *tnl_cfg,
                 const struct flow *tnl_flow,
                 struct ovs_action_push_tnl *data,
                 unsigned int *hlen)
{
    struct ip_header *ip;
    struct ovs_16aligned_ip6_hdr *ip6;
    struct udp_header *udp;
    bool is_ipv6;

    *hlen = sizeof(struct eth_header);

    is_ipv6 = is_header_ipv6(data->header);

    if (is_ipv6) {
        ip6 = ipv6_hdr(data->header);
        ip6->ip6_nxt = IPPROTO_UDP;
        udp = (struct udp_header *) (ip6 + 1);
        *hlen += IPV6_HEADER_LEN;
    } else {
        ip = ip_hdr(data->header);
        ip->ip_proto = IPPROTO_UDP;
        udp = (struct udp_header *) (ip + 1);
        *hlen += IP_HEADER_LEN;
    }

    udp->udp_dst = tnl_cfg->dst_port;

    /* For IPv6 the checksum marker is always written. */
    if (is_ipv6 || tnl_flow->tunnel.flags & FLOW_TNL_F_CSUM) {
        /* Write a value in now to mark that we should compute the checksum
         * later. 0xffff is handy because it is transparent to the
         * calculation. */
        udp->udp_csum = htons(0xffff);
    }

    return udp + 1;
}

/* Returns the GRE header length in bytes implied by 'flags': 4 bytes for the
 * base header plus 4 for each of the checksum, key and sequence options. */
static int
gre_header_len(ovs_be16 flags)
{
    int hlen = 4;

    if (flags & htons(GRE_CSUM)) {
        hlen += 4;
    }
    if (flags & htons(GRE_KEY)) {
        hlen += 4;
    }
    if (flags & htons(GRE_SEQ)) {
        hlen += 4;
    }
    return hlen;
}

/* Parses the outer IP and GRE headers of 'packet' into 'tnl'.
 *
 * Returns the total length of the Ethernet, IP and GRE headers on success.
 * Returns -EINVAL if IP extraction fails, if flags other than checksum, key
 * and sequence are set, if the payload is not ETH_TYPE_TEB, if the headers
 * exceed the packet, or if a present GRE checksum does not verify. */
static int
parse_gre_header(struct dp_packet *packet,
                 struct flow_tnl *tnl)
{
    const struct gre_base_hdr *greh;
    ovs_16aligned_be32 *options;
    int hlen;
    unsigned int ulen;

    greh = ip_extract_tnl_md(packet, tnl, &ulen);
    if (!greh) {
        return -EINVAL;
    }

    if (greh->flags & ~(htons(GRE_CSUM | GRE_KEY | GRE_SEQ))) {
        return -EINVAL;
    }

    if (greh->protocol != htons(ETH_TYPE_TEB)) {
        return -EINVAL;
    }

    hlen = ulen + gre_header_len(greh->flags);
    if (hlen > dp_packet_size(packet)) {
        return -EINVAL;
    }

    /* Optional 32-bit fields follow the base header in flag order. */
    options = (ovs_16aligned_be32 *)(greh + 1);
    if (greh->flags & htons(GRE_CSUM)) {
        ovs_be16 pkt_csum;

        pkt_csum = csum(greh, dp_packet_size(packet) -
                              ((const unsigned char *)greh -
                               (const unsigned char *)dp_packet_l2(packet)));
        if (pkt_csum) {
            return -EINVAL;
        }
        tnl->flags = FLOW_TNL_F_CSUM;
        options++;
    }

    if (greh->flags & htons(GRE_KEY)) {
        tnl->tun_id = (OVS_FORCE ovs_be64) ((OVS_FORCE uint64_t)(get_16aligned_be32(options)) << 32);
        tnl->flags |= FLOW_TNL_F_KEY;
        options++;
    }

    /* The sequence number, if present, is skipped. */
    if (greh->flags & htons(GRE_SEQ)) {
        options++;
    }

    return hlen;
}

/* Clears the start of 'md' in preparation for filling in tunnel metadata. */
static void
pkt_metadata_init_tnl(struct pkt_metadata *md)
{
    /* Zero up through the tunnel metadata options. The length and table
     * are before this and as long as they are empty, the options won't
     * be looked at. */
    memset(md, 0, offsetof(struct pkt_metadata, tunnel.metadata.opts));
}

/* Strips the outer Ethernet/IP/GRE headers from 'packet' and records the
 * tunnel metadata in the packet's metadata.  Returns 0 on success or EINVAL
 * if the packet is too short or parse_gre_header() rejects it. */
static int
netdev_gre_pop_header(struct dp_packet *packet)
{
    struct pkt_metadata *md = &packet->md;
    struct flow_tnl *tnl = &md->tunnel;
    /* Minimum size: Ethernet + IP + the 4-byte base GRE header. */
    int hlen = sizeof(struct eth_header) + 4;

    hlen += is_header_ipv6(dp_packet_data(packet)) ?
            IPV6_HEADER_LEN : IP_HEADER_LEN;

    pkt_metadata_init_tnl(md);
    if (hlen > dp_packet_size(packet)) {
        return EINVAL;
    }

    hlen = parse_gre_header(packet, tnl);
    if (hlen < 0) {
        /* parse_gre_header() returns a negative errno value. */
        return -hlen;
    }

    dp_packet_reset_packet(packet, hlen);

    return 0;
}

/* Pushes the prebuilt GRE header in 'data' onto 'packet' and, if the header
 * has the checksum flag set, computes the GRE checksum over the GRE header
 * and payload. */
static void
netdev_gre_push_header(struct dp_packet *packet,
                       const struct ovs_action_push_tnl *data)
{
    struct gre_base_hdr *greh;
    int ip_tot_size;

    greh = push_ip_header(packet, data->header, data->header_len, &ip_tot_size);

    if (greh->flags & htons(GRE_CSUM)) {
        ovs_be16 *csum_opt = (ovs_be16 *) (greh + 1);
        *csum_opt = csum(greh, ip_tot_size);
    }
}

/* Completes the header template in 'data' for a GRE tunnel: sets the IP
 * protocol, writes the GRE base header, and appends the checksum and key
 * options as requested by 'tnl_flow' and the port configuration.  Stores the
 * resulting header length and tunnel type in 'data'.  Always returns 0. */
static int
netdev_gre_build_header(const struct netdev *netdev,
                        struct ovs_action_push_tnl *data,
                        const struct flow *tnl_flow)
{
    struct netdev_vport *dev = netdev_vport_cast(netdev);
    struct netdev_tunnel_config *tnl_cfg;
    struct ip_header *ip;
    struct ovs_16aligned_ip6_hdr *ip6;
    struct gre_base_hdr *greh;
    ovs_16aligned_be32 *options;
    int hlen;
    bool is_ipv6;

    is_ipv6 = is_header_ipv6(data->header);

    /* XXX: RCUfy tnl_cfg. */
    ovs_mutex_lock(&dev->mutex);
    tnl_cfg = &dev->tnl_cfg;

    if (is_ipv6) {
        ip6 = ipv6_hdr(data->header);
        ip6->ip6_nxt = IPPROTO_GRE;
        greh = (struct gre_base_hdr *) (ip6 + 1);
    } else {
        ip = ip_hdr(data->header);
        ip->ip_proto = IPPROTO_GRE;
        greh = (struct gre_base_hdr *) (ip + 1);
    }

    greh->protocol = htons(ETH_TYPE_TEB);
    greh->flags = 0;

    options = (ovs_16aligned_be32 *) (greh + 1);
    if (tnl_flow->tunnel.flags & FLOW_TNL_F_CSUM) {
        greh->flags |= htons(GRE_CSUM);
        /* Placeholder; netdev_gre_push_header() fills in the checksum. */
        put_16aligned_be32(options, 0);
        options++;
    }

    if (tnl_cfg->out_key_present) {
        greh->flags |= htons(GRE_KEY);
        put_16aligned_be32(options, (OVS_FORCE ovs_be32)
                                    ((OVS_FORCE uint64_t) tnl_flow->tunnel.tun_id >> 32));
        options++;
    }

    ovs_mutex_unlock(&dev->mutex);

    hlen = (uint8_t *) options - (uint8_t *) greh;

    data->header_len = sizeof(struct eth_header) + hlen +
                       (is_ipv6 ? IPV6_HEADER_LEN : IP_HEADER_LEN);
    data->tnl_type = OVS_VPORT_TYPE_GRE;
    return 0;
}

/* Strips the outer Ethernet/IP/UDP/VXLAN headers from 'packet' and records
 * the tunnel metadata, including the VNI as the tunnel ID.  Returns 0 on
 * success or EINVAL if the packet is too short, the outer headers are
 * rejected, or the VXLAN flags or reserved VNI bits are unexpected. */
static int
netdev_vxlan_pop_header(struct dp_packet *packet)
{
    struct pkt_metadata *md = &packet->md;
    struct flow_tnl *tnl = &md->tunnel;
    struct vxlanhdr *vxh;
    unsigned int hlen;

    pkt_metadata_init_tnl(md);
    if (VXLAN_HLEN > dp_packet_l4_size(packet)) {
        return EINVAL;
    }

    vxh = udp_extract_tnl_md(packet, tnl, &hlen);
    if (!vxh) {
        return EINVAL;
    }

    /* The low 8 bits of the VNI word must be zero. */
    if (get_16aligned_be32(&vxh->vx_flags) != htonl(VXLAN_FLAGS) ||
       (get_16aligned_be32(&vxh->vx_vni) & htonl(0xff))) {
        VLOG_WARN_RL(&err_rl, "invalid vxlan flags=%#x vni=%#x\n",
                     ntohl(get_16aligned_be32(&vxh->vx_flags)),
                     ntohl(get_16aligned_be32(&vxh->vx_vni)));
        return EINVAL;
    }
    tnl->tun_id = htonll(ntohl(get_16aligned_be32(&vxh->vx_vni)) >> 8);
    tnl->flags |= FLOW_TNL_F_KEY;

    dp_packet_reset_packet(packet, hlen + VXLAN_HLEN);

    return 0;
}

/* Completes the header template in 'data' for a VXLAN tunnel: builds the
 * UDP header, then writes the VXLAN flags and the VNI taken from the tunnel
 * ID in 'tnl_flow'.  Stores the resulting header length and tunnel type in
 * 'data'.  Always returns 0. */
static int
netdev_vxlan_build_header(const struct netdev *netdev,
                          struct ovs_action_push_tnl *data,
                          const struct flow *tnl_flow)
{
    struct netdev_vport *dev = netdev_vport_cast(netdev);
    struct netdev_tunnel_config *tnl_cfg;
    struct vxlanhdr *vxh;
    unsigned int hlen;

    /* XXX: RCUfy tnl_cfg. */
    ovs_mutex_lock(&dev->mutex);
    tnl_cfg = &dev->tnl_cfg;

    vxh = udp_build_header(tnl_cfg, tnl_flow, data, &hlen);

    put_16aligned_be32(&vxh->vx_flags, htonl(VXLAN_FLAGS));
    put_16aligned_be32(&vxh->vx_vni, htonl(ntohll(tnl_flow->tunnel.tun_id) << 8));

    ovs_mutex_unlock(&dev->mutex);
    data->header_len = hlen + VXLAN_HLEN;
    data->tnl_type = OVS_VPORT_TYPE_VXLAN;
    return 0;
}

/* Strips the outer Ethernet/IP/UDP/Geneve headers (options included) from
 * 'packet' and records the tunnel metadata: VNI as tunnel ID, the OAM flag,
 * and a raw copy of the Geneve options.  Returns 0 on success or EINVAL if
 * the packet is too short, the outer headers are rejected, the Geneve
 * version is not 0, or the payload is not ETH_TYPE_TEB. */
static int
netdev_geneve_pop_header(struct dp_packet *packet)
{
    struct pkt_metadata *md = &packet->md;
    struct flow_tnl *tnl = &md->tunnel;
    struct genevehdr *gnh;
    unsigned int hlen, opts_len, ulen;

    pkt_metadata_init_tnl(md);
    if (GENEVE_BASE_HLEN > dp_packet_l4_size(packet)) {
        VLOG_WARN_RL(&err_rl, "geneve packet too small: min header=%u packet size=%"PRIuSIZE"\n",
                     (unsigned int)GENEVE_BASE_HLEN, dp_packet_l4_size(packet));
        return EINVAL;
    }

    gnh = udp_extract_tnl_md(packet, tnl, &ulen);
    if (!gnh) {
        return EINVAL;
    }

    /* 'opt_len' is scaled by 4 to get the options length in bytes. */
    opts_len = gnh->opt_len * 4;
    hlen = ulen + GENEVE_BASE_HLEN + opts_len;
    if (hlen > dp_packet_size(packet)) {
        VLOG_WARN_RL(&err_rl, "geneve packet too small: header len=%u packet size=%u\n",
                     hlen, dp_packet_size(packet));
        return EINVAL;
    }

    if (gnh->ver != 0) {
        VLOG_WARN_RL(&err_rl, "unknown geneve version: %"PRIu8"\n", gnh->ver);
        return EINVAL;
    }

    if (gnh->proto_type != htons(ETH_TYPE_TEB)) {
        VLOG_WARN_RL(&err_rl, "unknown geneve encapsulated protocol: %#x\n",
                     ntohs(gnh->proto_type));
        return EINVAL;
    }

    tnl->flags |= gnh->oam ? FLOW_TNL_F_OAM : 0;
    tnl->tun_id = htonll(ntohl(get_16aligned_be32(&gnh->vni)) >> 8);
    tnl->flags |= FLOW_TNL_F_KEY;

    /* Options are stored unparsed; FLOW_TNL_F_UDPIF is set alongside them. */
    memcpy(tnl->metadata.opts.gnv, gnh->options, opts_len);
    tnl->metadata.present.len = opts_len;
    tnl->flags |= FLOW_TNL_F_UDPIF;

    dp_packet_reset_packet(packet, hlen);

    return 0;
}

/* Completes the header template in 'data' for a Geneve tunnel: builds the
 * UDP header, writes the VNI from the tunnel ID in 'tnl_flow', appends the
 * Geneve options derived from the tunnel metadata, and sets the OAM and
 * critical bits.  Stores the resulting header length and tunnel type in
 * 'data'.  Always returns 0. */
static int
netdev_geneve_build_header(const struct netdev *netdev,
                           struct ovs_action_push_tnl *data,
                           const struct flow *tnl_flow)
{
    struct netdev_vport *dev = netdev_vport_cast(netdev);
    struct netdev_tunnel_config *tnl_cfg;
    struct genevehdr *gnh;
    int opt_len;
    bool crit_opt;
    unsigned int hlen;

    /* XXX: RCUfy tnl_cfg. */
    ovs_mutex_lock(&dev->mutex);
    tnl_cfg = &dev->tnl_cfg;

    gnh = udp_build_header(tnl_cfg, tnl_flow, data, &hlen);

    put_16aligned_be32(&gnh->vni, htonl(ntohll(tnl_flow->tunnel.tun_id) << 8));

    ovs_mutex_unlock(&dev->mutex);

    opt_len = tun_metadata_to_geneve_header(&tnl_flow->tunnel,
                                            gnh->options, &crit_opt);

    gnh->opt_len = opt_len / 4;
    gnh->oam = !!(tnl_flow->tunnel.flags & FLOW_TNL_F_OAM);
    gnh->critical = crit_opt ? 1 : 0;
    gnh->proto_type = htons(ETH_TYPE_TEB);

    data->header_len = hlen + GENEVE_BASE_HLEN + opt_len;
    data->tnl_type = OVS_VPORT_TYPE_GENEVE;
    return 0;
}

/* unixctl handler for the tunnel UDP source port range.
 *
 * With fewer than two arguments, replies with the current range.  With two
 * arguments, each must parse to a value in 1..UINT16_MAX; the smaller becomes
 * the new minimum and the larger the new maximum, 'tnl_conf_seq' is changed,
 * and "OK" is sent.  Any other argument count returns without replying. */
static void
netdev_vport_range(struct unixctl_conn *conn, int argc,
                   const char *argv[], void *aux OVS_UNUSED)
{
    int val1, val2;

    if (argc < 3) {
        struct ds ds = DS_EMPTY_INITIALIZER;

        ds_put_format(&ds, "Tunnel UDP source port range: %"PRIu16"-%"PRIu16"\n",
                            tnl_udp_port_min, tnl_udp_port_max);

        unixctl_command_reply(conn, ds_cstr(&ds));
        ds_destroy(&ds);
        return;
    }

    if (argc != 3) {
        return;
    }

    val1 = atoi(argv[1]);
    if (val1 <= 0 || val1 > UINT16_MAX) {
        unixctl_command_reply(conn, "Invalid min.");
        return;
    }
    val2 = atoi(argv[2]);
    if (val2 <= 0 || val2 > UINT16_MAX) {
        unixctl_command_reply(conn, "Invalid max.");
        return;
    }

    /* Accept the bounds in either order. */
    if (val1 > val2) {
        tnl_udp_port_min = val2;
        tnl_udp_port_max = val1;
    } else {
        tnl_udp_port_min = val1;
        tnl_udp_port_max = val2;
    }
    seq_change(tnl_conf_seq);

    unixctl_command_reply(conn, "OK");
}


#define VPORT_FUNCTIONS(GET_CONFIG, SET_CONFIG,             \
                        GET_TUNNEL_CONFIG, GET_STATUS,      \
                        BUILD_HEADER,                       \
                        PUSH_HEADER, POP_HEADER)            \
    NULL,                                                   \
    netdev_vport_run,                                       \
    netdev_vport_wait,                                      \
                                                            \
    netdev_vport_alloc,                                     \
    netdev_vport_construct,                                 \
    netdev_vport_destruct,                                  \
    netdev_vport_dealloc,                                   \
    GET_CONFIG,                                             \
    SET_CONFIG,                                             \
    GET_TUNNEL_CONFIG,                                      \
    BUILD_HEADER,                                           \
    PUSH_HEADER,                                            \
    POP_HEADER,                                             \
    NULL,                       /* get_numa_id */           \
    NULL,                       /* set_multiq */            \
                                                            \
    NULL,                       /* send */                  \
    NULL,                       /* send_wait */             \
                                                            \
    netdev_vport_set_etheraddr,                             \
    netdev_vport_get_etheraddr,                             \
    NULL,                       /* get_mtu */               \
    NULL,                       /* set_mtu */               \
    NULL,                       /* get_ifindex */           \
    NULL,                       /* get_carrier */           \
    NULL,                       /* get_carrier_resets */    \
    NULL,                       /* get_miimon */            \
    get_stats,                                              \
                                                            \
    NULL,                       /* get_features */          \
    NULL,                       /* set_advertisements */    \
                                                            \
    NULL,                       /* set_policing */          \
    NULL,                       /* get_qos_types */         \
    NULL,                       /* get_qos_capabilities */  \
    NULL,                       /* get_qos */               \
    NULL,                       /* set_qos */               \
    NULL,                       /* get_queue */             \
    NULL,                       /* set_queue */             \
    NULL,                       /* delete_queue */          \
    NULL,                       /* get_queue_stats */       \
    NULL,                       /* queue_dump_start */      \
    NULL,                       /* 
queue_dump_next */ \ NULL, /* queue_dump_done */ \ NULL, /* dump_queue_stats */ \ \ NULL, /* set_in4 */ \ NULL, /* get_addr_list */ \ NULL, /* add_router */ \ NULL, /* get_next_hop */ \ GET_STATUS, \ NULL, /* arp_lookup */ \ \ netdev_vport_update_flags, \ \ NULL, /* rx_alloc */ \ NULL, /* rx_construct */ \ NULL, /* rx_destruct */ \ NULL, /* rx_dealloc */ \ NULL, /* rx_recv */ \ NULL, /* rx_wait */ \ NULL, /* rx_drain */ #define TUNNEL_CLASS(NAME, DPIF_PORT, BUILD_HEADER, PUSH_HEADER, POP_HEADER) \ { DPIF_PORT, \ { NAME, false, \ VPORT_FUNCTIONS(get_tunnel_config, \ set_tunnel_config, \ get_netdev_tunnel_config, \ tunnel_get_status, \ BUILD_HEADER, PUSH_HEADER, POP_HEADER) }} void netdev_vport_tunnel_register(void) { /* The name of the dpif_port should be short enough to accomodate adding * a port number to the end if one is necessary. */ static const struct vport_class vport_classes[] = { TUNNEL_CLASS("geneve", "genev_sys", netdev_geneve_build_header, push_udp_header, netdev_geneve_pop_header), TUNNEL_CLASS("gre", "gre_sys", netdev_gre_build_header, netdev_gre_push_header, netdev_gre_pop_header), TUNNEL_CLASS("ipsec_gre", "gre_sys", NULL, NULL, NULL), TUNNEL_CLASS("vxlan", "vxlan_sys", netdev_vxlan_build_header, push_udp_header, netdev_vxlan_pop_header), TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL), TUNNEL_CLASS("stt", "stt_sys", NULL, NULL, NULL), }; static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; if (ovsthread_once_start(&once)) { int i; for (i = 0; i < ARRAY_SIZE(vport_classes); i++) { netdev_register_provider(&vport_classes[i].netdev_class); } unixctl_command_register("tnl/egress_port_range", "min max", 0, 2, netdev_vport_range, NULL); ovsthread_once_done(&once); } } void netdev_vport_patch_register(void) { static const struct vport_class patch_class = { NULL, { "patch", false, VPORT_FUNCTIONS(get_patch_config, set_patch_config, NULL, NULL, NULL, NULL, NULL) }}; netdev_register_provider(&patch_class.netdev_class); }