diff options
61 files changed, 7597 insertions, 2426 deletions
@@ -156,7 +156,7 @@ A: The following table lists the Linux kernel versions against which the | 2.1.x | 2.6.32 to 3.11 | 2.3.x | 2.6.32 to 3.14 | 2.4.x | 2.6.32 to 4.0 -| 2.5.x | 2.6.32 to 4.2 +| 2.5.x | 2.6.32 to 4.3 Open vSwitch userspace should also work with the Linux kernel module built into Linux 3.3 and later. diff --git a/acinclude.m4 b/acinclude.m4 index e4846d90a..7c8afaca6 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -134,10 +134,10 @@ AC_DEFUN([OVS_CHECK_LINUX], [ AC_MSG_RESULT([$kversion]) if test "$version" -ge 4; then - if test "$version" = 4 && test "$patchlevel" -le 2; then + if test "$version" = 4 && test "$patchlevel" -le 3; then : # Linux 4.x else - AC_ERROR([Linux kernel in $KBUILD is version $kversion, but version newer than 4.2.x is not supported (please refer to the FAQ for advice)]) + AC_ERROR([Linux kernel in $KBUILD is version $kversion, but version newer than 4.3.x is not supported (please refer to the FAQ for advice)]) fi elif test "$version" = 3; then : # Linux 3.x @@ -313,15 +313,28 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ mkdir -p datapath/linux : > datapath/linux/kcompat.h.new + echo '#include <linux/version.h> +#ifndef RHEL_RELEASE_CODE +#define RHEL_RELEASE_CODE 0 +#define RHEL_RELEASE_VERSION(a, b) 0 +#endif' >> datapath/linux/kcompat.h.new + OVS_GREP_IFELSE([$KSRC/arch/x86/include/asm/checksum_32.h], [src_err,], [OVS_DEFINE([HAVE_CSUM_COPY_DBG])]) + OVS_GREP_IFELSE([$KSRC/include/net/addrconf.h], [ipv6_dst_lookup.*net], + [OVS_DEFINE([HAVE_IPV6_DST_LOOKUP_NET])]) + OVS_GREP_IFELSE([$KSRC/include/net/addrconf.h], [ipv6_stub]) + OVS_GREP_IFELSE([$KSRC/include/linux/err.h], [ERR_CAST]) OVS_GREP_IFELSE([$KSRC/include/linux/err.h], [IS_ERR_OR_NULL]) OVS_GREP_IFELSE([$KSRC/include/linux/etherdevice.h], [eth_hw_addr_random]) OVS_GREP_IFELSE([$KSRC/include/linux/etherdevice.h], [ether_addr_copy]) + OVS_GREP_IFELSE([$KSRC/include/uapi/linux/if_link.h], [IFLA_GENEVE_TOS]) + OVS_GREP_IFELSE([$KSRC/include/uapi/linux/if_link.h], 
[rtnl_link_stats64]) + OVS_GREP_IFELSE([$KSRC/include/linux/if_link.h], [rtnl_link_stats64]) OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [vlan_set_encap_proto]) OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [vlan_hwaccel_push_inside]) @@ -329,9 +342,13 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/linux/in.h], [proto_ports_offset]) OVS_GREP_IFELSE([$KSRC/include/net/ip.h], [__ip_select_ident.*dst_entry], [OVS_DEFINE([HAVE_IP_SELECT_IDENT_USING_DST_ENTRY])]) + OVS_GREP_IFELSE([$KSRC/include/net/ip.h], [__ip_select_ident.*net], + [OVS_DEFINE([HAVE_IP_SELECT_IDENT_USING_NET])]) + OVS_GREP_IFELSE([$KSRC/include/net/ip.h], [inet_get_local_port_range.*net], [OVS_DEFINE([HAVE_INET_GET_LOCAL_PORT_RANGE_USING_NET])]) OVS_GREP_IFELSE([$KSRC/include/net/ip.h], [ip_is_fragment]) + OVS_GREP_IFELSE([$KSRC/include/net/dst_metadata.h], [metadata_dst]) OVS_GREP_IFELSE([$KSRC/include/linux/net.h], [sock_create_kern.*net], [OVS_DEFINE([HAVE_SOCK_CREATE_KERN_NET])]) @@ -340,17 +357,33 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [dev_get_by_index_rcu]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [__skb_gso_segment]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [can_checksum_protocol]) + OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [ndo_get_iflink]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [netdev_features_t]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [pcpu_sw_netstats]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [netdev_rx_handler_register]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [net_device_extended]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [rx_handler_func_t.*pskb], [OVS_DEFINE([HAVE_RX_HANDLER_PSKB])]) + OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [netif_needs_gso.*net_device], + [OVS_DEFINE([HAVE_NETIF_NEEDS_GSO_NETDEV])]) + OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [udp_offload]) + 
OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [udp_offload.*uoff], + [OVS_DEFINE([HAVE_UDP_OFFLOAD_ARG_UOFF])]) + OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [gro_remcsum]) + + OVS_GREP_IFELSE([$KSRC/include/linux/netfilter.h], [nf_hook_state]) + OVS_GREP_IFELSE([$KSRC/include/linux/netfilter.h], [nf_register_net_hook]) OVS_GREP_IFELSE([$KSRC/include/linux/netfilter.h], [nf_hookfn.*nf_hook_ops], [OVS_DEFINE([HAVE_NF_HOOKFN_ARG_OPS])]) OVS_GREP_IFELSE([$KSRC/include/linux/random.h], [prandom_u32]) + OVS_GREP_IFELSE([$KSRC/include/net/rtnetlink.h], [get_link_net]) + OVS_GREP_IFELSE([$KSRC/include/net/rtnetlink.h], [name_assign_type]) + OVS_GREP_IFELSE([$KSRC/include/net/rtnetlink.h], [rtnl_create_link.*src_net], + [OVS_DEFINE([HAVE_RTNL_CREATE_LINK_SRC_NET])]) + OVS_GREP_IFELSE([$KSRC/include/net/net_namespace.h], [possible_net_t]) + OVS_GREP_IFELSE([$KSRC/include/linux/rcupdate.h], [rcu_read_lock_held], [], [OVS_GREP_IFELSE([$KSRC/include/linux/rtnetlink.h], [rcu_read_lock_held])]) @@ -364,7 +397,12 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [[[^@]]proto_data_valid], [OVS_DEFINE([HAVE_PROTO_DATA_VALID])]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [skb_checksum_start_offset]) + OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_protocol]) + OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_mac_header]) + OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_network_header]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [kfree_skb_list]) + OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [skb_scrub_packet.*xnet], + [OVS_DEFINE([HAVE_SKB_SCRUB_PACKET_XNET])]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [rxhash]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [u16.*rxhash], [OVS_DEFINE([HAVE_U16_RXHASH])]) @@ -423,7 +461,9 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/net/geneve.h], [geneve_hdr]) OVS_GREP_IFELSE([$KSRC/include/net/gre.h], [gre_cisco_register]) + 
OVS_GREP_IFELSE([$KSRC/include/net/gre.h], [gre_handle_offloads]) OVS_GREP_IFELSE([$KSRC/include/net/ipv6.h], [IP6_FH_F_SKIP_RH]) + OVS_GREP_IFELSE([$KSRC/include/net/ipv6.h], [ip6_local_out_sk]) OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_get_be16]) OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_put_be16]) OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_put_be32]) @@ -438,7 +478,7 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ [OVS_DEFINE([HAVE_VLAN_BUG_WORKAROUND])]) OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [vlan_insert_tag_set_proto]) OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [__vlan_insert_tag]) - + OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [vlan_get_protocol]) OVS_GREP_IFELSE([$KSRC/include/linux/u64_stats_sync.h], [u64_stats_fetch_begin_irq]) @@ -446,17 +486,18 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ [OVS_DEFINE([HAVE_RHEL_OVS_HOOK])]) OVS_GREP_IFELSE([$KSRC/include/net/vxlan.h], [struct vxlan_metadata], [OVS_DEFINE([HAVE_VXLAN_METADATA])]) + OVS_GREP_IFELSE([$KSRC/include/net/vxlan.h], [VXLAN_HF_RCO]) OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [udp_flow_src_port], [OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [inet_get_local_port_range(net], [OVS_DEFINE([HAVE_UDP_FLOW_SRC_PORT])])]) OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [udp_v4_check]) OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [udp_set_csum]) - OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [ignore_df:1], + OVS_GREP_IFELSE([$KSRC/include/net/udp_tunnel.h], [udp_tunnel_gro_complete]) + OVS_GREP_IFELSE([$KSRC/include/net/udp_tunnel.h], [ipv6_v6only], + [OVS_DEFINE([HAVE_UDP_TUNNEL_IPV6])]) + + OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [ignore_df], [OVS_DEFINE([HAVE_IGNORE_DF_RENAME])]) - OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [SKB_GSO_GRE_CSUM], - [OVS_DEFINE([HAVE_SKB_GSO_GRE_CSUM])]) - OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [SKB_GSO_UDP_TUNNEL_CSUM], - [OVS_DEFINE([HAVE_SKB_GSO_UDP_TUNNEL_CSUM])]) 
OVS_GREP_IFELSE([$KSRC/include/uapi/linux/netdevice.h], [NET_NAME_UNKNOWN], [OVS_DEFINE([HAVE_NET_NAME_UNKNOWN])]) diff --git a/datapath/Modules.mk b/datapath/Modules.mk index 8dc3415c6..c06eafc1e 100644 --- a/datapath/Modules.mk +++ b/datapath/Modules.mk @@ -42,8 +42,7 @@ openvswitch_headers = \ vlan.h \ vport.h \ vport-internal_dev.h \ - vport-netdev.h \ - vport-vxlan.h + vport-netdev.h openvswitch_extras = \ README.md diff --git a/datapath/actions.c b/datapath/actions.c index c529bbb9b..f45f61998 100644 --- a/datapath/actions.c +++ b/datapath/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -618,12 +618,11 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) else kfree_skb(skb); } - static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, const struct nlattr *actions, int actions_len) { - struct ovs_tunnel_info info; + struct ip_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; @@ -650,11 +649,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, if (vport) { int err; + upcall.egress_tun_info = &info; err = ovs_vport_get_egress_tun_info(vport, skb, - &info); - if (!err) - upcall.egress_tun_info = &info; + &upcall); + if (err) + upcall.egress_tun_info = NULL; } + break; } @@ -748,7 +749,11 @@ static int execute_set_action(struct sk_buff *skb, { /* Only tunnel set execution is supported without a mask. 
*/ if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { - OVS_CB(skb)->egress_tun_info = nla_data(a); + struct ovs_tunnel_info *tun = nla_data(a); + + ovs_skb_dst_drop(skb); + ovs_dst_hold((struct dst_entry *)tun->tun_dst); + ovs_skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); return 0; } diff --git a/datapath/compat.h b/datapath/compat.h index c827b11aa..a30003f8b 100644 --- a/datapath/compat.h +++ b/datapath/compat.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -43,41 +43,7 @@ #define inet_sport(sk) (inet_sk(sk)->inet_sport) #endif -static inline struct rtable *find_route(struct net *net, - __be32 *saddr, __be32 daddr, - u8 ipproto, u8 tos, u32 skb_mark) -{ - struct rtable *rt; - /* Tunnel configuration keeps DSCP part of TOS bits, But Linux - * router expect RT_TOS bits only. - */ - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) - struct flowi fl = { .nl_u = { .ip4_u = { - .daddr = daddr, - .saddr = *saddr, - .tos = RT_TOS(tos) } }, - .mark = skb_mark, - .proto = ipproto }; - - if (unlikely(ip_route_output_key(net, &rt, &fl))) - return ERR_PTR(-EADDRNOTAVAIL); - *saddr = fl.nl_u.ip4_u.saddr; - return rt; -#else - struct flowi4 fl = { .daddr = daddr, - .saddr = *saddr, - .flowi4_tos = RT_TOS(tos), - .flowi4_mark = skb_mark, - .flowi4_proto = ipproto }; - - rt = ip_route_output_key(net, &fl); - *saddr = fl.saddr; - return rt; -#endif -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) static inline bool skb_encapsulation(struct sk_buff *skb) { return skb->encapsulation; diff --git a/datapath/datapath.c b/datapath/datapath.c index 5f362425e..32561a3ce 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -56,6 +56,7 @@ #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" +#include "gso.h" #include "vlan.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -178,7 +179,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); - return vport->ops->get_name(vport); + return ovs_vport_name(vport); } static int get_dpifindex(const struct datapath *dp) @@ -190,7 +191,7 @@ static int get_dpifindex(const struct datapath *dp) local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) - ifindex = netdev_vport_priv(local)->dev->ifindex; + ifindex = local->dev->ifindex; else ifindex = 0; @@ -480,10 +481,12 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); + if (upcall_info->egress_tun_info) { nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); err = ovs_nla_put_egress_tunnel_key(user_skb, - upcall_info->egress_tun_info); + upcall_info->egress_tun_info, + upcall_info->egress_tun_opts); BUG_ON(err); nla_nest_end(user_skb, nla); } @@ -590,7 +593,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); - OVS_CB(packet)->egress_tun_info = NULL; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; @@ -607,6 +609,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!input_vport) goto err_unlock; + packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); @@ -1028,7 +1031,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_unlock(); - 
ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } @@ -1040,7 +1043,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); err_kfree_flow: ovs_flow_free(new_flow, false); error: @@ -1167,7 +1170,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info); if (old_acts) - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); return 0; @@ -1175,7 +1178,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); error: return error; } @@ -1810,7 +1813,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || - nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport))) + nla_put_string(skb, OVS_VPORT_ATTR_NAME, + ovs_vport_name(vport))) goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); @@ -2228,13 +2232,11 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, struct vport *vport; hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) continue; - netdev_vport = netdev_vport_priv(vport); - if (dev_net(netdev_vport->dev) == dnet) + if (dev_net(vport->dev) == dnet) list_add(&vport->detach_list, head); } } diff --git a/datapath/datapath.h b/datapath/datapath.h index aca9407a4..aefac6d25 100644 --- a/datapath/datapath.h +++ b/datapath/datapath.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -26,12 +26,12 @@ #include <linux/skbuff.h> #include <linux/u64_stats_sync.h> #include <net/net_namespace.h> +#include <net/ip_tunnels.h> #include "compat.h" #include "flow.h" #include "flow_table.h" #include "vlan.h" -#include "vport.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 @@ -95,13 +95,10 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB - * @egress_tun_info: Tunnel information about this packet on egress path. - * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. */ struct ovs_skb_cb { - struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -117,7 +114,8 @@ struct ovs_skb_cb { * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. */ struct dp_upcall_info { - const struct ovs_tunnel_info *egress_tun_info; + struct ip_tunnel_info *egress_tun_info; + const void *egress_tun_opts; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; @@ -129,12 +127,10 @@ struct dp_upcall_info { * struct ovs_net - Per net-namespace data for ovs. * @dps: List of datapaths to enable dumping them all out. * Protected by genl_mutex. - * @vport_net: Per network namespace data for vport. 
*/ struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; - struct vport_net vport_net; }; extern int ovs_net_id; diff --git a/datapath/dp_notify.c b/datapath/dp_notify.c index f9a037510..9434c19c7 100644 --- a/datapath/dp_notify.c +++ b/datapath/dp_notify.c @@ -60,13 +60,10 @@ void ovs_dp_notify_wq(struct work_struct *work) struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) continue; - netdev_vport = netdev_vport_priv(vport); - if (!(ovs_netdev_get_vport(netdev_vport->dev))) + if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) dp_detach_port_notify(vport); } } diff --git a/datapath/flow.c b/datapath/flow.c index 8ef60d134..3375d7b4d 100644 --- a/datapath/flow.c +++ b/datapath/flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -48,7 +48,7 @@ #include "datapath.h" #include "flow.h" #include "flow_netlink.h" - +#include "vport.h" #include "vlan.h" u64 ovs_flow_used_time(unsigned long flow_jiffies) @@ -684,19 +684,21 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. 
*/ if (tun_info) { - memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); + if (ip_tunnel_info_af(tun_info) != AF_INET) + return -EINVAL; + memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); BUILD_BUG_ON(((1 << (sizeof(tun_info->options_len) * 8)) - 1) > sizeof(key->tun_opts)); - if (tun_info->options) { - memcpy(TUN_METADATA_OPTS(key, tun_info->options_len), - tun_info->options, tun_info->options_len); + if (tun_info->options_len) { + ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len), + tun_info); key->tun_opts_len = tun_info->options_len; } else { key->tun_opts_len = 0; diff --git a/datapath/flow.h b/datapath/flow.h index 2433436d8..1abb2e15a 100644 --- a/datapath/flow.h +++ b/datapath/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -32,31 +32,11 @@ #include <linux/time.h> #include <linux/flex_array.h> #include <net/inet_ecn.h> +#include <net/ip_tunnels.h> +#include <net/dst_metadata.h> struct sk_buff; -/* Used to memset ovs_key_ipv4_tunnel padding. */ -#define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) - -struct ovs_key_ipv4_tunnel { - __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; - __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; - __be16 tp_src; - __be16 tp_dst; -} __packed __aligned(4); /* Minimize padding. */ - -struct ovs_tunnel_info { - struct ovs_key_ipv4_tunnel tunnel; - const void *options; - u8 options_len; -}; - /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. 
@@ -66,54 +46,9 @@ struct ovs_tunnel_info { #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) -static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - tun_info->tunnel.tun_id = tun_id; - tun_info->tunnel.ipv4_src = saddr; - tun_info->tunnel.ipv4_dst = daddr; - tun_info->tunnel.ipv4_tos = tos; - tun_info->tunnel.ipv4_ttl = ttl; - tun_info->tunnel.tun_flags = tun_flags; - - /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of - * the upper tunnel are used. - * E.g: GRE over IPSEC, the tp_src and tp_port are zero. - */ - tun_info->tunnel.tp_src = tp_src; - tun_info->tunnel.tp_dst = tp_dst; - - /* Clear struct padding. */ - if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) - memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, - 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); - - tun_info->options = opts; - tun_info->options_len = opts_len; -} - -static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - const struct iphdr *iph, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, - iph->tos, iph->ttl, - tp_src, tp_dst, - tun_id, tun_flags, - opts, opts_len); -} +struct ovs_tunnel_info { + struct metadata_dst *tun_dst; +}; #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ @@ -122,7 +57,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, struct sw_flow_key { u8 tun_opts[255]; u8 tun_opts_len; - struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ + struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. 
*/ @@ -273,7 +208,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies); /* Update the non-metadata part of the flow key using skb. */ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c index 3a3492beb..f95aa1436 100644 --- a/datapath/flow_netlink.c +++ b/datapath/flow_netlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -45,11 +45,12 @@ #include <net/ipv6.h> #include <net/ndisc.h> #include <net/mpls.h> +#include <net/vxlan.h> #include "datapath.h" #include "flow.h" #include "flow_netlink.h" -#include "vport-vxlan.h" +#include "gso.h" struct ovs_len_tbl { int len; @@ -485,7 +486,7 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, struct nlattr *a; int rem; unsigned long opt_key_offset; - struct ovs_vxlan_opts opts; + struct vxlan_metadata opts; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); @@ -568,19 +569,19 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_KEY; break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TOS: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + SW_FLOW_KEY_PUT(match, tun_key.tos, nla_get_u8(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TTL: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + 
SW_FLOW_KEY_PUT(match, tun_key.ttl, nla_get_u8(a), is_mask); ttl = true; break; @@ -643,7 +644,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, } if (!is_mask) { - if (!match->key->tun_key.ipv4_dst) { + if (!match->key->tun_key.u.ipv4.dst) { OVS_NLERR(log, "IPv4 tunnel dst address is zero"); return -EINVAL; } @@ -660,7 +661,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, static int vxlan_opt_to_nlattr(struct sk_buff *skb, const void *tun_opts, int swkey_tun_opts_len) { - const struct ovs_vxlan_opts *opts = tun_opts; + const struct vxlan_metadata *opts = tun_opts; struct nlattr *nla; nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); @@ -675,22 +676,24 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, } static int __ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (output->ipv4_src && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) + if (output->u.ipv4.src && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, + output->u.ipv4.src)) return -EMSGSIZE; - if (output->ipv4_dst && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) + if (output->u.ipv4.dst && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, + output->u.ipv4.dst)) return -EMSGSIZE; - if (output->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + if (output->tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) @@ -712,8 +715,8 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, 
nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, swkey_tun_opts_len, tun_opts)) return -EMSGSIZE; - else if (output->tun_flags & TUNNEL_VXLAN_OPT && - vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) + else if (output->tun_flags & TUNNEL_VXLAN_OPT && + vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; } @@ -721,7 +724,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, } static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { struct nlattr *nla; @@ -740,10 +743,11 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, } int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ovs_tunnel_info *egress_tun_info) + const struct ip_tunnel_info *egress_tun_info, + const void *egress_tun_opts) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, - egress_tun_info->options, + return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, + egress_tun_opts, egress_tun_info->options_len); } @@ -860,7 +864,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); } - if (attrs & (1ULL << OVS_KEY_ATTR_IPV4)) { + if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { const struct ovs_key_ipv4 *ipv4_key; ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); @@ -881,7 +885,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, ipv4_key->ipv4_src, is_mask); SW_FLOW_KEY_PUT(match, ipv4.addr.dst, ipv4_key->ipv4_dst, is_mask); - attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4); + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); } if (attrs & (1ULL << OVS_KEY_ATTR_IPV6)) { @@ -1152,7 +1156,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, /* The userspace does not send tunnel attributes that * are 0, but we should not wildcard them nonetheless. 
*/ - if (match->key->tun_key.ipv4_dst) + if (match->key->tun_key.u.ipv4.dst) SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); @@ -1324,7 +1328,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask)) { + if ((swkey->tun_key.u.ipv4.dst || is_mask)) { const void *opts = NULL; if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) @@ -1585,20 +1589,49 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) return sfa; } -/* RCU callback used by ovs_nla_free_flow_actions. */ -static void rcu_free_acts_callback(struct rcu_head *rcu) +static void ovs_nla_free_set_action(const struct nlattr *a) { - struct sw_flow_actions *sf_acts = container_of(rcu, - struct sw_flow_actions, rcu); + const struct nlattr *ovs_key = nla_data(a); + struct ovs_tunnel_info *ovs_tun; + + switch (nla_type(ovs_key)) { + case OVS_KEY_ATTR_TUNNEL_INFO: + ovs_tun = nla_data(ovs_key); + ovs_dst_release((struct dst_entry *)ovs_tun->tun_dst); + break; + } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + const struct nlattr *a; + int rem; + + if (!sf_acts) + return; + + nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + switch (nla_type(a)) { + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; + } + } + kfree(sf_acts); } +static void __ovs_nla_free_flow_actions(struct rcu_head *head) +{ + ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu)); +} + /* Schedules 'sf_acts' to be freed after the next RCU grace period. * The caller must hold rcu_read_lock for this to be sensible. 
*/ -void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts) { - call_rcu(&sf_acts->rcu, rcu_free_acts_callback); + call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions); } static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, @@ -1794,10 +1827,11 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; - struct ovs_tunnel_info *tun_info; + struct metadata_dst *tun_dst; + struct ip_tunnel_info *tun_info; + struct ovs_tunnel_info *ovs_tun; struct nlattr *a; - int start, opts_type; - int err = 0; + int err = 0, start, opts_type; ovs_match_init(&match, &key, NULL); opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); @@ -1820,27 +1854,31 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; + tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL); + if (!tun_dst) + return -ENOMEM; + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info) + key.tun_opts_len, log); - if (IS_ERR(a)) + sizeof(*ovs_tun), log); + if (IS_ERR(a)) { + ovs_dst_release((struct dst_entry *)tun_dst); return PTR_ERR(a); + } - tun_info = nla_data(a); - tun_info->tunnel = key.tun_key; - tun_info->options_len = key.tun_opts_len; + ovs_tun = nla_data(a); + ovs_tun->tun_dst = tun_dst; - if (tun_info->options_len) { - /* We need to store the options in the action itself since - * everything else will go away after flow setup. We can append - * it to tun_info and then point there. - */ - memcpy((tun_info + 1), - TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len); - tun_info->options = (tun_info + 1); - } else { - tun_info->options = NULL; - } + tun_info = &tun_dst->u.tun_info; + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->key = key.tun_key; + /* We need to store the options in the action itself since + * everything else will go away after flow setup. 
We can append + * it to tun_info and then point there. + */ + ip_tunnel_info_opts_set(tun_info, + TUN_METADATA_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); add_nested_action_end(*sfa, start); return err; @@ -2225,7 +2263,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) - kfree(*sfa); + ovs_nla_free_flow_actions(*sfa); return err; } @@ -2275,15 +2313,16 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) switch (key_type) { case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + err = ipv4_tun_to_nlattr(skb, &tun_info->key, tun_info->options_len ? - tun_info->options : NULL, + ip_tunnel_info_opts(tun_info) : NULL, tun_info->options_len); if (err) return err; diff --git a/datapath/flow_netlink.h b/datapath/flow_netlink.h index 5c3d75bff..140bbe707 100644 --- a/datapath/flow_netlink.h +++ b/datapath/flow_netlink.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -54,8 +54,9 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, const struct nlattr *mask, bool log); -int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ovs_tunnel_info *); +int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, + const struct ip_tunnel_info *egress_tun_info, + const void *egress_tun_opts); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, @@ -69,5 +70,6 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); void ovs_nla_free_flow_actions(struct sw_flow_actions *); +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); #endif /* flow_netlink.h */ diff --git a/datapath/flow_table.c b/datapath/flow_table.c index eeadf8600..b51be69e8 100644 --- a/datapath/flow_table.c +++ b/datapath/flow_table.c @@ -45,6 +45,7 @@ #include <net/ndisc.h> #include "vlan.h" +#include "flow_netlink.h" #define TBL_MIN_BUCKETS 1024 #define MASK_ARRAY_SIZE_MIN 16 @@ -151,7 +152,8 @@ static void flow_free(struct sw_flow *flow) if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); - kfree(rcu_dereference_raw(flow->sf_acts)); + if (flow->sf_acts) + ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); for_each_node(node) if (flow->stats[node]) kmem_cache_free(flow_stats_cache, @@ -505,7 +507,7 @@ static u32 flow_hash(const struct sw_flow_key *key, static int flow_key_start(const struct sw_flow_key *key) { - if (key->tun_key.ipv4_dst) + if (key->tun_key.u.ipv4.dst) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk index 96c3d55d7..7e66e14a5 100644 --- a/datapath/linux/Modules.mk 
+++ b/datapath/linux/Modules.mk @@ -7,7 +7,10 @@ openvswitch_sources += \ linux/compat/gre.c \ linux/compat/gso.c \ linux/compat/genetlink-openvswitch.c \ + linux/compat/ip_gre.c \ + linux/compat/ip_tunnel.c \ linux/compat/ip_tunnels_core.c \ + linux/compat/lisp.c \ linux/compat/netdevice.c \ linux/compat/net_namespace.c \ linux/compat/reciprocal_div.c \ @@ -33,6 +36,7 @@ openvswitch_headers += \ linux/compat/include/linux/if.h \ linux/compat/include/linux/if_arp.h \ linux/compat/include/linux/if_ether.h \ + linux/compat/include/linux/if_link.h \ linux/compat/include/linux/if_vlan.h \ linux/compat/include/linux/in.h \ linux/compat/include/linux/ip.h \ @@ -40,6 +44,7 @@ openvswitch_headers += \ linux/compat/include/linux/jiffies.h \ linux/compat/include/linux/kconfig.h \ linux/compat/include/linux/kernel.h \ + linux/compat/include/net/lisp.h \ linux/compat/include/linux/list.h \ linux/compat/include/linux/mpls.h \ linux/compat/include/linux/net.h \ @@ -63,17 +68,23 @@ openvswitch_headers += \ linux/compat/include/linux/workqueue.h \ linux/compat/include/net/checksum.h \ linux/compat/include/net/dst.h \ + linux/compat/include/net/dst_metadata.h \ linux/compat/include/net/flow_keys.h \ linux/compat/include/net/genetlink.h \ linux/compat/include/net/geneve.h \ linux/compat/include/net/gre.h \ + linux/compat/include/net/inet_ecn.h \ linux/compat/include/net/inet_frag.h \ linux/compat/include/net/ip.h \ linux/compat/include/net/ip_tunnels.h \ + linux/compat/include/net/ip6_route.h \ + linux/compat/include/net/ip6_tunnel.h \ linux/compat/include/net/ipv6.h \ linux/compat/include/net/mpls.h \ linux/compat/include/net/net_namespace.h \ linux/compat/include/net/netlink.h \ + linux/compat/include/net/route.h \ + linux/compat/include/net/rtnetlink.h \ linux/compat/include/net/udp.h \ linux/compat/include/net/udp_tunnel.h \ linux/compat/include/net/sock.h \ diff --git a/datapath/linux/compat/dev-openvswitch.c b/datapath/linux/compat/dev-openvswitch.c index 38ec8fe9e..d7d4224a1 
100644 --- a/datapath/linux/compat/dev-openvswitch.c +++ b/datapath/linux/compat/dev-openvswitch.c @@ -1,6 +1,7 @@ #include <linux/if_bridge.h> #include <linux/netdevice.h> #include <linux/version.h> +#include <net/rtnetlink.h> #ifndef HAVE_DEV_DISABLE_LRO @@ -93,3 +94,24 @@ void rpl_netdev_rx_handler_unregister(struct net_device *dev) EXPORT_SYMBOL_GPL(rpl_netdev_rx_handler_unregister); #endif + +int rpl_rtnl_delete_link(struct net_device *dev) +{ + const struct rtnl_link_ops *ops; + + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34) + ops->dellink(dev); +#else + { + LIST_HEAD(list_kill); + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } +#endif + return 0; +} diff --git a/datapath/linux/compat/geneve.c b/datapath/linux/compat/geneve.c index 85cf95f4c..297593ce6 100644 --- a/datapath/linux/compat/geneve.c +++ b/datapath/linux/compat/geneve.c @@ -1,122 +1,235 @@ /* - * Geneve: Generic Network Virtualization Encapsulation + * GENEVE: Generic Network Virtualization Encapsulation * - * Copyright (c) 2014 Nicira, Inc. + * Copyright (c) 2015 Red Hat, Inc. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/version.h> -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) - #include <linux/kernel.h> -#include <linux/types.h> #include <linux/module.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/skbuff.h> -#include <linux/list.h> #include <linux/netdevice.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/igmp.h> #include <linux/etherdevice.h> -#include <linux/if_ether.h> -#include <linux/if_vlan.h> -#include <linux/ethtool.h> -#include <linux/mutex.h> -#include <net/arp.h> -#include <net/ndisc.h> -#include <net/ip.h> -#include <net/ip_tunnels.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/rtnetlink.h> -#include <net/route.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> +#include <linux/hash.h> +#include <linux/if_link.h> + +#include <net/dst_metadata.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/rtnetlink.h> #include <net/geneve.h> #include <net/protocol.h> -#include <net/udp_tunnel.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ipv6.h> -#include <net/addrconf.h> -#include <net/ip6_tunnel.h> -#include <net/ip6_checksum.h> -#endif -#include "compat.h" #include "gso.h" +#include "vport-netdev.h" +#include "compat.h" + +#ifndef HAVE_METADATA_DST +#define GENEVE_NETDEV_VER "0.6" + +#define GENEVE_UDP_PORT 6081 + +#define GENEVE_N_VID (1u << 24) +#define GENEVE_VID_MASK (GENEVE_N_VID - 1) -static void geneve_build_header(struct genevehdr *geneveh, - __be16 tun_flags, u8 vni[3], - u8 options_len, u8 *options) +#define VNI_HASH_BITS 10 +#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) + +#define GENEVE_VER 0 +#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) + +/* per-network namespace private data for this module */ +struct geneve_net { + struct list_head geneve_list; + struct list_head sock_list; +}; + +static int geneve_net_id; + +/* Pseudo network device */ +struct geneve_dev { + 
struct hlist_node hlist; /* vni hash table */ + struct net *net; /* netns for packet i/o */ + struct net_device *dev; /* netdev for geneve tunnel */ + struct geneve_sock *sock; /* socket used for geneve tunnel */ + u8 vni[3]; /* virtual network ID for tunnel */ + u8 ttl; /* TTL override */ + u8 tos; /* TOS override */ + struct sockaddr_in remote; /* IPv4 address for link partner */ + struct list_head next; /* geneve's per namespace list */ + __be16 dst_port; + bool collect_md; +}; + +struct geneve_sock { + bool collect_md; + struct list_head list; + struct socket *sock; + struct rcu_head rcu; + int refcnt; +#ifdef HAVE_UDP_OFFLOAD + struct udp_offload udp_offloads; +#endif + struct hlist_head vni_list[VNI_HASH_SIZE]; +}; + +static inline __u32 geneve_net_vni_hash(u8 vni[3]) { - geneveh->ver = GENEVE_VER; - geneveh->opt_len = options_len / 4; - geneveh->oam = !!(tun_flags & TUNNEL_OAM); - geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); - geneveh->rsvd1 = 0; - memcpy(geneveh->vni, vni, 3); - geneveh->proto_type = htons(ETH_P_TEB); - geneveh->rsvd2 = 0; + __u32 vnid; - memcpy(geneveh->options, options, options_len); + vnid = (vni[0] << 16) | (vni[1] << 8) | vni[2]; + return hash_32(vnid, VNI_HASH_BITS); } -/* Transmit a fully formatted Geneve frame. - * - * When calling this function. The skb->data should point - * to the geneve header which is fully formed. - * - * This function will add other UDP tunnel headers. 
- */ -int rpl_geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) +static __be64 vni_to_tunnel_id(const __u8 *vni) { - struct genevehdr *gnvh; - int min_headroom; +#ifdef __BIG_ENDIAN + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +#else + return (__force __be64)(((__force u64)vni[0] << 40) | + ((__force u64)vni[1] << 48) | + ((__force u64)vni[2] << 56)); +#endif +} + +static struct geneve_dev *geneve_lookup(struct geneve_sock *gs, + __be32 addr, u8 vni[]) +{ + struct hlist_head *vni_list_head; + struct geneve_dev *geneve; + __u32 hash; + + /* Find the device for this VNI */ + hash = geneve_net_vni_hash(vni); + vni_list_head = &gs->vni_list[hash]; + hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) { + if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) && + addr == geneve->remote.sin_addr.s_addr) + return geneve; + } + return NULL; +} + +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + +/* geneve receive/decap routine */ +static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) +{ + struct genevehdr *gnvh = geneve_hdr(skb); + struct metadata_dst *tun_dst; + struct geneve_dev *geneve = NULL; +#ifdef HAVE_DEV_TSTATS + struct pcpu_sw_netstats *stats; +#endif + struct iphdr *iph; + u8 *vni; + __be32 addr; int err; + union { + struct metadata_dst dst; + char buf[sizeof(struct metadata_dst) + 256]; + } buf; - min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + iph = ip_hdr(skb); /* outer IP header... 
*/ - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - return err; + if (gs->collect_md) { + static u8 zero_vni[3]; + + vni = zero_vni; + addr = 0; + } else { + vni = gnvh->vni; + addr = iph->saddr; } - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) - return -ENOMEM; + geneve = geneve_lookup(gs, addr, vni); + if (!geneve) + goto drop; - skb = udp_tunnel_handle_offloads(skb, csum, (opt_len == 0)); - if (IS_ERR(skb)) - return PTR_ERR(skb); + if (ip_tunnel_collect_metadata() || gs->collect_md) { + __be16 flags; - gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); + flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | + (gnvh->oam ? TUNNEL_OAM : 0) | + (gnvh->critical ? TUNNEL_CRIT_OPT : 0); - ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + tun_dst = &buf.dst; + ovs_udp_tun_rx_dst(&tun_dst->u.tun_info, skb, AF_INET, flags, + vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4); + /* Update tunnel dst according to Geneve options. */ + ip_tunnel_info_opts_set(&tun_dst->u.tun_info, + gnvh->options, gnvh->opt_len * 4); + } else { + /* Drop packets w/ critical options, + * since we don't support any... 
+ */ + tun_dst = NULL; + if (gnvh->critical) + goto drop; + } + + skb_reset_mac_header(skb); + skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev))); + skb->protocol = eth_type_trans(skb, geneve->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + if (tun_dst) + ovs_skb_dst_set(skb, &tun_dst->dst); + else + goto drop; + /* Ignore packet loops (and multicast echo) */ + if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) + goto drop; + + skb_reset_network_header(skb); + + err = IP_ECN_decapsulate(iph, skb); + + if (unlikely(err)) { + if (err > 1) { + ++geneve->dev->stats.rx_frame_errors; + ++geneve->dev->stats.rx_errors; + goto drop; + } + } - return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, - tos, ttl, df, src_port, dst_port, xnet, - !csum); +#ifdef HAVE_DEV_TSTATS + stats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)geneve->dev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); +#endif + netdev_port_receive(skb, &tun_dst->u.tun_info); + return; +drop: + /* Consume bad packet */ + kfree_skb(skb); +} + +#ifdef HAVE_DEV_TSTATS +/* Setup stats when device is created */ +static int geneve_init(struct net_device *dev) +{ + dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; } -EXPORT_SYMBOL_GPL(rpl_geneve_xmit_skb); + +static void geneve_uninit(struct net_device *dev) +{ + free_percpu(dev->tstats); +} +#endif /* Callback from net/ipv4/udp.c to receive packets */ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) @@ -131,7 +244,6 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) /* Return packets with reserved bits set */ geneveh = geneve_hdr(skb); - if (unlikely(geneveh->ver != GENEVE_VER)) goto error; @@ -147,7 +259,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if 
(!gs) goto drop; - gs->rcv(gs, skb); + geneve_rx(gs, skb); return 0; drop: @@ -186,14 +298,135 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6, return sock; } +#ifdef HAVE_UDP_OFFLOAD +static void geneve_notify_add_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + int err; + + if (sa_family == AF_INET) { + err = udp_add_offload(&gs->udp_offloads); + if (err) + pr_warn("geneve: udp_add_offload failed with status %d\n", + err); + } +} + +static int geneve_hlen(struct genevehdr *gh) +{ + return sizeof(*gh) + gh->opt_len * 4; +} + +#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF +static struct sk_buff **geneve_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +#else +static struct sk_buff **geneve_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) +#endif +{ + struct sk_buff *p, **pp = NULL; + struct genevehdr *gh, *gh2; + unsigned int hlen, gh_len, off_gnv; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_gnv = skb_gro_offset(skb); + hlen = off_gnv + sizeof(*gh); + gh = skb_gro_header_fast(skb, off_gnv); + if (skb_gro_header_hard(skb, hlen)) { + gh = skb_gro_header_slow(skb, hlen, off_gnv); + if (unlikely(!gh)) + goto out; + } + + if (gh->ver != GENEVE_VER || gh->oam) + goto out; + gh_len = geneve_hlen(gh); + + hlen = off_gnv + gh_len; + if (skb_gro_header_hard(skb, hlen)) { + gh = skb_gro_header_slow(skb, hlen, off_gnv); + if (unlikely(!gh)) + goto out; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + gh2 = (struct genevehdr *)(p->data + off_gnv); + if (gh->opt_len != gh2->opt_len || + memcmp(gh, gh2, gh_len)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + type = gh->proto_type; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) { + flush = 1; + goto out_unlock; + } + + skb_gro_pull(skb, gh_len); + skb_gro_postpull_rcsum(skb, gh, 
gh_len); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF +static int geneve_gro_complete(struct sk_buff *skb, int nhoff) +#else +static int geneve_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) +#endif +{ + struct genevehdr *gh; + struct packet_offload *ptype; + __be16 type; + int gh_len; + int err = -ENOSYS; + + udp_tunnel_gro_complete(skb, nhoff); + + gh = (struct genevehdr *)(skb->data + nhoff); + gh_len = geneve_hlen(gh); + type = gh->proto_type; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); + + rcu_read_unlock(); + return err; +} +#endif + /* Create new listen socket if needed */ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, bool ipv6) { + struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; struct socket *sock; struct udp_tunnel_sock_cfg tunnel_cfg; + int h; gs = kzalloc(sizeof(*gs), GFP_KERNEL); if (!gs) @@ -206,39 +439,671 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, } gs->sock = sock; - gs->rcv = rcv; - gs->rcv_data = data; + gs->refcnt = 1; + for (h = 0; h < VNI_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&gs->vni_list[h]); + /* Initialize the geneve udp offloads structure */ +#ifdef HAVE_UDP_OFFLOAD + gs->udp_offloads.port = port; + gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; + gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; + geneve_notify_add_rx_port(gs); +#endif /* Mark socket as an encapsulation socket */ tunnel_cfg.sk_user_data = gs; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = geneve_udp_encap_recv; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - + list_add(&gs->list, &gn->sock_list); return gs; } -struct geneve_sock 
*rpl_geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6) +static void geneve_notify_del_rx_port(struct geneve_sock *gs) { - return geneve_socket_create(net, port, rcv, data, ipv6); +#ifdef HAVE_UDP_OFFLOAD + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + + if (sa_family == AF_INET) + udp_del_offload(&gs->udp_offloads); +#endif } -EXPORT_SYMBOL_GPL(rpl_geneve_sock_add); -static void rcu_free_gs(struct rcu_head *rcu) +static void free_gs_rcu(struct rcu_head *rcu) { struct geneve_sock *gs = container_of(rcu, struct geneve_sock, rcu); kfree(gs); } -void rpl_geneve_sock_release(struct geneve_sock *gs) +static void geneve_sock_release(struct geneve_sock *gs) { + if (--gs->refcnt) + return; + + list_del(&gs->list); + geneve_notify_del_rx_port(gs); udp_tunnel_sock_release(gs->sock); - call_rcu(&gs->rcu, rcu_free_gs); + call_rcu(&gs->rcu, free_gs_rcu); } -EXPORT_SYMBOL_GPL(rpl_geneve_sock_release); -#endif /* kernel < 4.0 */ +static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, + __be16 dst_port) +{ + struct geneve_sock *gs; + + list_for_each_entry(gs, &gn->sock_list, list) { + if (inet_sport(gs->sock->sk) == dst_port && + inet_sk(gs->sock->sk)->sk.sk_family == AF_INET) { + return gs; + } + } + return NULL; +} + +static int geneve_open(struct net_device *dev) +{ + struct geneve_dev *geneve = netdev_priv(dev); + struct net *net = geneve->net; + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + __u32 hash; + + gs = geneve_find_sock(gn, geneve->dst_port); + if (gs) { + gs->refcnt++; + goto out; + } + + gs = geneve_socket_create(net, geneve->dst_port, false); + if (IS_ERR(gs)) + return PTR_ERR(gs); + +out: + gs->collect_md = geneve->collect_md; + geneve->sock = gs; + + hash = geneve_net_vni_hash(geneve->vni); + hlist_add_head_rcu(&geneve->hlist, &gs->vni_list[hash]); + return 0; +} + +static int geneve_stop(struct net_device *dev) +{ + struct 
geneve_dev *geneve = netdev_priv(dev); + struct geneve_sock *gs = geneve->sock; + + if (!hlist_unhashed(&geneve->hlist)) + hlist_del_rcu(&geneve->hlist); + geneve_sock_release(gs); + return 0; +} + +static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool csum) +{ + struct genevehdr *gnvh; + int min_headroom; + int err; + + min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len + + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) { + kfree_skb(skb); + goto free_rt; + } + + skb = vlan_hwaccel_push_inside(skb); + if (!skb) { + err = -ENOMEM; + goto free_rt; + } + + skb = udp_tunnel_handle_offloads(skb, csum, 0, false); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + goto free_rt; + } + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); + gnvh->ver = GENEVE_VER; + gnvh->opt_len = opt_len / 4; + gnvh->oam = !!(tun_flags & TUNNEL_OAM); + gnvh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); + gnvh->rsvd1 = 0; + memcpy(gnvh->vni, vni, 3); + gnvh->proto_type = htons(ETH_P_TEB); + gnvh->rsvd2 = 0; + memcpy(gnvh->options, opt, opt_len); + + ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + return 0; + +free_rt: + ip_rt_put(rt); + return err; +} + +static struct rtable *geneve_get_rt(struct sk_buff *skb, + struct net_device *dev, + struct flowi4 *fl4, + struct ip_tunnel_info *info) +{ + struct geneve_dev *geneve = netdev_priv(dev); + struct rtable *rt = NULL; + __u8 tos; + + memset(fl4, 0, sizeof(*fl4)); + fl4->flowi4_mark = skb->mark; + fl4->flowi4_proto = IPPROTO_UDP; + + if (info) { + fl4->daddr = info->key.u.ipv4.dst; + fl4->saddr = info->key.u.ipv4.src; + fl4->flowi4_tos = RT_TOS(info->key.tos); + } else { + tos = geneve->tos; + if (tos == 1) { + const struct iphdr *iip = ip_hdr(skb); + + tos = ip_tunnel_get_dsfield(iip, skb); + } + + fl4->flowi4_tos = 
RT_TOS(tos); + fl4->daddr = geneve->remote.sin_addr.s_addr; + } + + rt = ip_route_output_key(geneve->net, fl4); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr); + dev->stats.tx_carrier_errors++; + return rt; + } + if (rt_dst(rt).dev == dev) { /* is this necessary? */ + netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr); + dev->stats.collisions++; + ip_rt_put(rt); + return ERR_PTR(-EINVAL); + } + return rt; +} + +/* Convert 64 bit tunnel ID to 24 bit VNI. */ +static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) +{ +#ifdef __BIG_ENDIAN + vni[0] = (__force __u8)(tun_id >> 16); + vni[1] = (__force __u8)(tun_id >> 8); + vni[2] = (__force __u8)tun_id; +#else + vni[0] = (__force __u8)((__force u64)tun_id >> 40); + vni[1] = (__force __u8)((__force u64)tun_id >> 48); + vni[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + +netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct geneve_dev *geneve = netdev_priv(dev); + struct geneve_sock *gs = geneve->sock; + struct ip_tunnel_info *info = NULL; + struct rtable *rt = NULL; + const struct iphdr *iip; /* interior IP header */ + struct flowi4 fl4; + __u8 tos, ttl; + __be16 sport; + bool udp_csum; + __be16 df; + int err; + + if (geneve->collect_md) { + info = skb_tunnel_info(skb); + if (unlikely(info && !(info->mode & IP_TUNNEL_INFO_TX))) { + netdev_dbg(dev, "no tunnel metadata\n"); + goto tx_error; + } + if (info && ip_tunnel_info_af(info) != AF_INET) + goto tx_error; + } + + rt = geneve_get_rt(skb, dev, &fl4, info); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); + dev->stats.tx_carrier_errors++; + goto tx_error; + } + + sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); + skb_reset_mac_header(skb); + + iip = ip_hdr(skb); + + if (info) { + const struct ip_tunnel_key *key = &info->key; + u8 *opts = NULL; + u8 vni[3]; + + tunnel_id_to_vni(key->tun_id, vni); + if (key->tun_flags & TUNNEL_GENEVE_OPT) + opts = 
ip_tunnel_info_opts(info); + + udp_csum = !!(key->tun_flags & TUNNEL_CSUM); + err = geneve_build_skb(rt, skb, key->tun_flags, vni, + info->options_len, opts, udp_csum); + if (unlikely(err)) + goto err; + + tos = ip_tunnel_ecn_encap(key->tos, iip, skb); + ttl = key->ttl; + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + } else { + udp_csum = false; + err = geneve_build_skb(rt, skb, 0, geneve->vni, + 0, NULL, udp_csum); + if (unlikely(err)) + goto err; + + tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb); + ttl = geneve->ttl; + if (!ttl && IN_MULTICAST(ntohl(fl4.daddr))) + ttl = 1; + ttl = ttl ? : ip4_dst_hoplimit(&rt_dst(rt)); + df = 0; + } + err = udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, fl4.saddr, fl4.daddr, + tos, ttl, df, sport, geneve->dst_port, + !net_eq(geneve->net, dev_net(geneve->dev)), + !udp_csum); + + iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *) dev->tstats); + return NETDEV_TX_OK; + +tx_error: + dev_kfree_skb(skb); +err: + dev->stats.tx_errors++; + return NETDEV_TX_OK; +} +EXPORT_SYMBOL(rpl_geneve_xmit); + +static netdev_tx_t geneve_dev_xmit(struct sk_buff *skb, struct net_device *dev) +{ + /* Drop All packets coming from networking stack. OVS-CB is + * not initialized for these packets. 
+ */ + + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + +static const struct net_device_ops geneve_netdev_ops = { +#ifdef HAVE_DEV_TSTATS + .ndo_init = geneve_init, + .ndo_uninit = geneve_uninit, + .ndo_get_stats64 = ip_tunnel_get_stats64, +#endif + .ndo_open = geneve_open, + .ndo_stop = geneve_stop, + .ndo_start_xmit = geneve_dev_xmit, + .ndo_change_mtu = eth_change_mtu, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_mac_address = eth_mac_addr, +}; + +static void geneve_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + strlcpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version)); + strlcpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver)); +} + +static const struct ethtool_ops geneve_ethtool_ops = { + .get_drvinfo = geneve_get_drvinfo, + .get_link = ethtool_op_get_link, +}; + +/* Info for udev, that this is a virtual tunnel endpoint */ +static struct device_type geneve_type = { + .name = "geneve", +}; + +/* Initialize the device structure. 
*/ +static void geneve_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->netdev_ops = &geneve_netdev_ops; + dev->ethtool_ops = &geneve_ethtool_ops; + dev->destructor = free_netdev; + + SET_NETDEV_DEVTYPE(dev, &geneve_type); + + dev->features |= NETIF_F_LLTX; + dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; + dev->features |= NETIF_F_RXCSUM; + dev->features |= NETIF_F_GSO_SOFTWARE; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; +#endif +#if 0 + /* Not required */ + netif_keep_dst(dev); +#endif + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + eth_hw_addr_random(dev); +} + +static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { + [IFLA_GENEVE_ID] = { .type = NLA_U32 }, + [IFLA_GENEVE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_GENEVE_TTL] = { .type = NLA_U8 }, + [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, + [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, + [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG }, +}; + +static int geneve_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + if (!data) + return -EINVAL; + + if (data[IFLA_GENEVE_ID]) { + __u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]); + + if (vni >= GENEVE_VID_MASK) + return -ERANGE; + } + + return 0; +} + +static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, + __be16 dst_port, + __be32 rem_addr, + u8 vni[], + bool *tun_on_same_port, + bool *tun_collect_md) +{ + struct geneve_dev *geneve, *t; + + *tun_on_same_port = false; + *tun_collect_md = false; + t = NULL; + list_for_each_entry(geneve, &gn->geneve_list, next) { + if (geneve->dst_port == dst_port) { + *tun_collect_md = geneve->collect_md; + *tun_on_same_port = true; + } + if (!memcmp(vni, 
geneve->vni, sizeof(geneve->vni)) && + rem_addr == geneve->remote.sin_addr.s_addr && + dst_port == geneve->dst_port) + t = geneve; + } + return t; +} + +static int geneve_configure(struct net *net, struct net_device *dev, + __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos, + __be16 dst_port, bool metadata) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_dev *t, *geneve = netdev_priv(dev); + bool tun_collect_md, tun_on_same_port; + int err; + + if (metadata) { + if (rem_addr || vni || tos || ttl) + return -EINVAL; + } + + geneve->net = net; + geneve->dev = dev; + + geneve->vni[0] = (vni & 0x00ff0000) >> 16; + geneve->vni[1] = (vni & 0x0000ff00) >> 8; + geneve->vni[2] = vni & 0x000000ff; + + geneve->remote.sin_addr.s_addr = rem_addr; + if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr))) + return -EINVAL; + + geneve->ttl = ttl; + geneve->tos = tos; + geneve->dst_port = dst_port; + geneve->collect_md = metadata; + + t = geneve_find_dev(gn, dst_port, rem_addr, geneve->vni, + &tun_on_same_port, &tun_collect_md); + if (t) + return -EBUSY; + + if (metadata) { + if (tun_on_same_port) + return -EPERM; + } else { + if (tun_collect_md) + return -EPERM; + } + + err = register_netdevice(dev); + if (err) + return err; + + list_add(&geneve->next, &gn->geneve_list); + return 0; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) +static int geneve_newlink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net *net = &init_net; +#else +static int geneve_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ +#endif + __be16 dst_port = htons(GENEVE_UDP_PORT); + __u8 ttl = 0, tos = 0; + bool metadata = false; + __be32 rem_addr; + __u32 vni; + + if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE]) + return -EINVAL; + + vni = nla_get_u32(data[IFLA_GENEVE_ID]); + rem_addr = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); + + if (data[IFLA_GENEVE_TTL]) + ttl = 
nla_get_u8(data[IFLA_GENEVE_TTL]); + + if (data[IFLA_GENEVE_TOS]) + tos = nla_get_u8(data[IFLA_GENEVE_TOS]); + + if (data[IFLA_GENEVE_PORT]) + dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]); + + if (data[IFLA_GENEVE_COLLECT_METADATA]) + metadata = true; + + return geneve_configure(net, dev, rem_addr, vni, + ttl, tos, dst_port, metadata); +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) +static void geneve_dellink(struct net_device *dev) +#else +static void geneve_dellink(struct net_device *dev, struct list_head *head) +#endif +{ + struct geneve_dev *geneve = netdev_priv(dev); + + list_del(&geneve->next); + unregister_netdevice_queue(dev, head); +} + +static size_t geneve_get_size(const struct net_device *dev) +{ + return nla_total_size(sizeof(__u32)) + /* IFLA_GENEVE_ID */ + nla_total_size(sizeof(struct in_addr)) + /* IFLA_GENEVE_REMOTE */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ + nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ + nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ + 0; +} + +static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct geneve_dev *geneve = netdev_priv(dev); + __u32 vni; + + vni = (geneve->vni[0] << 16) | (geneve->vni[1] << 8) | geneve->vni[2]; + if (nla_put_u32(skb, IFLA_GENEVE_ID, vni)) + goto nla_put_failure; + + if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE, + geneve->remote.sin_addr.s_addr)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) || + nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos)) + goto nla_put_failure; + + if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port)) + goto nla_put_failure; + + if (geneve->collect_md) { + if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA)) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static struct rtnl_link_ops geneve_link_ops __read_mostly = { + .kind = "ovs_geneve", + .maxtype = IFLA_GENEVE_MAX, + 
.policy = geneve_policy, + .priv_size = sizeof(struct geneve_dev), + .setup = geneve_setup, + .validate = geneve_validate, + .newlink = geneve_newlink, + .dellink = geneve_dellink, + .get_size = geneve_get_size, + .fill_info = geneve_fill_info, +}; + +struct net_device *rpl_geneve_dev_create_fb(struct net *net, const char *name, + u8 name_assign_type, u16 dst_port) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + int err; + + memset(tb, 0, sizeof(tb)); + dev = rtnl_create_link(net, (char *) name, name_assign_type, + &geneve_link_ops, tb); + if (IS_ERR(dev)) + return dev; + + err = geneve_configure(net, dev, 0, 0, 0, 0, htons(dst_port), true); + if (err) { + free_netdev(dev); + return ERR_PTR(err); + } + return dev; +} +EXPORT_SYMBOL_GPL(rpl_geneve_dev_create_fb); + +static __net_init int geneve_init_net(struct net *net) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + + INIT_LIST_HEAD(&gn->geneve_list); + INIT_LIST_HEAD(&gn->sock_list); + return 0; +} + +static void __net_exit geneve_exit_net(struct net *net) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_dev *geneve, *next; + struct net_device *dev, *aux; + LIST_HEAD(list); + + rtnl_lock(); + + /* gather any geneve devices that were moved into this ns */ + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &geneve_link_ops) + unregister_netdevice_queue(dev, &list); + + /* now gather any other geneve devices that were created in this ns */ + list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) { + /* If geneve->dev is in the same netns, it was already added + * to the list by the previous loop. 
+ */ + if (!net_eq(dev_net(geneve->dev), net)) + unregister_netdevice_queue(geneve->dev, &list); + } + + /* unregister the devices gathered above */ + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations geneve_net_ops = { + .init = geneve_init_net, + .exit = geneve_exit_net, + .id = &geneve_net_id, + .size = sizeof(struct geneve_net), +}; + +DEFINE_COMPAT_PNET_REG_FUNC(device) +int rpl_geneve_init_module(void) +{ + int rc; + + rc = register_pernet_subsys(&geneve_net_ops); + if (rc) + goto out1; + + rc = rtnl_link_register(&geneve_link_ops); + if (rc) + goto out2; + + pr_info("Geneve tunneling driver\n"); + return 0; +out2: + unregister_pernet_subsys(&geneve_net_ops); +out1: + return rc; +} + +void rpl_geneve_cleanup_module(void) +{ + rtnl_link_unregister(&geneve_link_ops); + unregister_pernet_subsys(&geneve_net_ops); +} +#endif diff --git a/datapath/linux/compat/gre.c b/datapath/linux/compat/gre.c index fe8138014..fa8d9368f 100644 --- a/datapath/linux/compat/gre.c +++ b/datapath/linux/compat/gre.c @@ -38,9 +38,10 @@ #include "gso.h" +#ifndef HAVE_METADATA_DST #if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX) -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0) +#ifndef HAVE_GRE_HANDLE_OFFLOADS #ifndef HAVE_GRE_CISCO_REGISTER @@ -147,6 +148,43 @@ static __sum16 check_checksum(struct sk_buff *skb) return csum; } +#define ip_gre_calc_hlen rpl_ip_gre_calc_hlen +static int ip_gre_calc_hlen(__be16 o_flags) +{ + int addend = 4; + + if (o_flags & TUNNEL_CSUM) + addend += 4; + if (o_flags & TUNNEL_KEY) + addend += 4; + if (o_flags & TUNNEL_SEQ) + addend += 4; + return addend; +} + +#define gre_flags_to_tnl_flags rpl_gre_flags_to_tnl_flags +static __be16 gre_flags_to_tnl_flags(__be16 flags) +{ + __be16 tflags = 0; + + if (flags & GRE_CSUM) + tflags |= TUNNEL_CSUM; + if (flags & GRE_ROUTING) + tflags |= TUNNEL_ROUTING; + if (flags & GRE_KEY) + tflags |= TUNNEL_KEY; + if (flags & GRE_SEQ) + tflags |= TUNNEL_SEQ; + if (flags & GRE_STRICT) + tflags |= 
TUNNEL_STRICT; + if (flags & GRE_REC) + tflags |= TUNNEL_REC; + if (flags & GRE_VERSION) + tflags |= TUNNEL_VERSION; + + return tflags; +} + static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err) { @@ -269,86 +307,7 @@ int rpl_gre_cisco_unregister(struct gre_cisco_protocol *proto) EXPORT_SYMBOL_GPL(rpl_gre_cisco_unregister); #endif /* !HAVE_GRE_CISCO_REGISTER */ - -/* GRE TX side. */ -static void gre_nop_fix(struct sk_buff *skb) { } - -static void gre_csum_fix(struct sk_buff *skb) -{ - struct gre_base_hdr *greh; - __be32 *options; - int gre_offset = skb_transport_offset(skb); - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - options = ((__be32 *)greh + 1); - - *options = 0; - *(__sum16 *)options = csum_fold(skb_checksum(skb, gre_offset, - skb->len - gre_offset, 0)); -} - -static bool is_gre_gso(struct sk_buff *skb) -{ - return skb_is_gso(skb); -} - -void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len) -{ - struct gre_base_hdr *greh; - - __skb_push(skb, hdr_len); - - greh = (struct gre_base_hdr *)skb->data; - greh->flags = tnl_flags_to_gre_flags(tpi->flags); - greh->protocol = tpi->proto; - - if (tpi->flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - - if (tpi->flags & TUNNEL_SEQ) { - *ptr = tpi->seq; - ptr--; - } - if (tpi->flags & TUNNEL_KEY) { - *ptr = tpi->key; - ptr--; - } - if (tpi->flags & TUNNEL_CSUM && !is_gre_gso(skb)) { - *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); - } - } - - ovs_skb_set_inner_protocol(skb, tpi->proto); -} -EXPORT_SYMBOL_GPL(rpl_gre_build_header); - -struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum) -{ - int type = gre_csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE; - gso_fix_segment_t fix_segment; - - if (gre_csum) - fix_segment = gre_csum_fix; - else - fix_segment = gre_nop_fix; - - return ovs_iptunnel_handle_offloads(skb, gre_csum, type, fix_segment); -} -#else -struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum) -{ - if (skb_is_gso(skb) && skb_is_encapsulated(skb)) { - kfree_skb(skb); - return ERR_PTR(-ENOSYS); - } - skb_clear_ovs_gso_cb(skb); -#undef gre_handle_offloads - return gre_handle_offloads(skb, gre_csum); -} #endif -EXPORT_SYMBOL_GPL(rpl_gre_handle_offloads); #endif /* CONFIG_NET_IPGRE_DEMUX */ +#endif /* HAVE_METADATA_DST */ diff --git a/datapath/linux/compat/gso.c b/datapath/linux/compat/gso.c index 2c19b5890..c52b2b136 100644 --- a/datapath/linux/compat/gso.c +++ b/datapath/linux/compat/gso.c @@ -130,7 +130,7 @@ int rpl_dev_queue_xmit(struct sk_buff *skb) if (mpls) features &= NETIF_F_SG; - if (netif_needs_gso(skb->dev, skb, features)) { + if (netif_needs_gso(skb, features)) { struct sk_buff *nskb; nskb = skb_gso_segment(skb, features); diff --git a/datapath/linux/compat/gso.h b/datapath/linux/compat/gso.h index 6fcaff8d6..eb756ebe0 100644 --- a/datapath/linux/compat/gso.h +++ b/datapath/linux/compat/gso.h @@ -2,30 +2,36 @@ #define __LINUX_GSO_WRAPPER_H #include <linux/version.h> -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) - -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <net/protocol.h> - #include "datapath.h" + typedef void (*gso_fix_segment_t)(struct sk_buff *); struct ovs_gso_cb { struct ovs_skb_cb dp_cb; +#ifndef HAVE_METADATA_DST + struct metadata_dst *tun_dst; +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) gso_fix_segment_t fix_segment; -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) +#endif +#ifndef HAVE_INNER_PROTOCOL __be16 inner_protocol; #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) +#ifndef HAVE_INNER_MAC_HEADER unsigned int inner_mac_header; #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) 
+#ifndef HAVE_INNER_NETWORK_HEADER unsigned int inner_network_header; #endif }; #define OVS_GSO_CB(skb) ((struct ovs_gso_cb *)(skb)->cb) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/protocol.h> + static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb) { OVS_GSO_CB(skb)->fix_segment = NULL; @@ -37,7 +43,7 @@ static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb) } #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) +#ifndef HAVE_INNER_MAC_HEADER static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) { return skb->head + OVS_GSO_CB(skb)->inner_mac_header; @@ -48,9 +54,9 @@ static inline void skb_set_inner_mac_header(const struct sk_buff *skb, { OVS_GSO_CB(skb)->inner_mac_header = (skb->data - skb->head) + offset; } -#endif +#endif /* HAVE_INNER_MAC_HEADER */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) +#ifndef HAVE_INNER_NETWORK_HEADER static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb) { return skb->head + OVS_GSO_CB(skb)->inner_network_header; @@ -88,15 +94,17 @@ static inline int ovs_skb_inner_transport_offset(const struct sk_buff *skb) return skb_inner_transport_header(skb) - skb->data; } -#endif +#endif /* HAVE_INNER_NETWORK_HEADER */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) -static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) { +#ifndef HAVE_INNER_PROTOCOL +static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) +{ OVS_GSO_CB(skb)->inner_protocol = htons(0); } static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb, - __be16 ethertype) { + __be16 ethertype) +{ OVS_GSO_CB(skb)->inner_protocol = ethertype; } @@ -107,31 +115,28 @@ static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb) #else -static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) { +static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) +{ /* Nothing to do. 
The inner_protocol is either zero or * has been set to a value by another user. * Either way it may be considered initialised. */ } -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) -static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb, - __be16 ethertype) +static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb) { - skb->inner_protocol = ethertype; + return skb->inner_protocol; } + +#ifdef ENCAP_TYPE_ETHER +#define ovs_skb_set_inner_protocol skb_set_inner_protocol #else static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb, __be16 ethertype) { - skb_set_inner_protocol(skb, ethertype); -} -#endif - -static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb) -{ - return skb->inner_protocol; + skb->inner_protocol = ethertype; } +#endif /* ENCAP_TYPE_ETHER */ #endif /* 3.11 */ #if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) @@ -153,4 +158,40 @@ static inline void skb_reset_inner_headers(struct sk_buff *skb) } #endif /* 3.18 */ +#ifndef HAVE_METADATA_DST +/* We need two separate functions to manage different dst in this case. + * First is dst_entry and second is tunnel-dst. + * So define ovs_* separate functions for tun_dst. 
+ */ +static inline void ovs_skb_dst_set(struct sk_buff *skb, void *dst) +{ + OVS_GSO_CB(skb)->tun_dst = (void *)dst; +} + +static inline struct ip_tunnel_info *ovs_skb_tunnel_info(struct sk_buff *skb) +{ + return &OVS_GSO_CB(skb)->tun_dst->u.tun_info; +} + +static inline void ovs_skb_dst_drop(struct sk_buff *skb) +{ + OVS_GSO_CB(skb)->tun_dst = NULL; +} + +static inline void ovs_dst_hold(void *dst) +{ +} + +static inline void ovs_dst_release(struct dst_entry *dst) +{ + kfree(dst); +} + +#else +#define ovs_skb_dst_set skb_dst_set +#define ovs_skb_dst_drop skb_dst_drop +#define ovs_dst_hold dst_hold +#define ovs_dst_release dst_release +#endif + #endif diff --git a/datapath/linux/compat/include/linux/etherdevice.h b/datapath/linux/compat/include/linux/etherdevice.h index c9c0a999d..850b7798d 100644 --- a/datapath/linux/compat/include/linux/etherdevice.h +++ b/datapath/linux/compat/include/linux/etherdevice.h @@ -64,4 +64,29 @@ static inline bool eth_proto_is_802_3(__be16 proto) } #endif +#define ether_addr_equal rpl_ether_addr_equal +static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | + ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); + + return fold == 0; +#else + const u16 *a = (const u16 *)addr1; + const u16 *b = (const u16 *)addr2; + + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; +#endif +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) +#define eth_gro_receive rpl_eth_gro_receive +struct sk_buff **rpl_eth_gro_receive(struct sk_buff **head, + struct sk_buff *skb); + +#define eth_gro_complete rpl_eth_gro_complete +int rpl_eth_gro_complete(struct sk_buff *skb, int nhoff); +#endif + #endif diff --git a/datapath/linux/compat/include/linux/if_link.h b/datapath/linux/compat/include/linux/if_link.h new file mode 100644 index 000000000..6209dcbad --- /dev/null +++ 
b/datapath/linux/compat/include/linux/if_link.h @@ -0,0 +1,151 @@ +#ifndef _LINUX_IF_LINK_WRAPPER_H +#define _LINUX_IF_LINK_WRAPPER_H + +#include_next<linux/if_link.h> + +/* GENEVE section */ +enum { +#define IFLA_GENEVE_UNSPEC rpl_IFLA_GENEVE_UNSPEC + IFLA_GENEVE_UNSPEC, + +#define IFLA_GENEVE_ID rpl_IFLA_GENEVE_ID + IFLA_GENEVE_ID, + +#define IFLA_GENEVE_REMOTE rpl_IFLA_GENEVE_REMOTE + IFLA_GENEVE_REMOTE, + +#define IFLA_GENEVE_TTL rpl_IFLA_GENEVE_TTL + IFLA_GENEVE_TTL, + +#define IFLA_GENEVE_TOS rpl_IFLA_GENEVE_TOS + IFLA_GENEVE_TOS, + +#define IFLA_GENEVE_PORT rpl_IFLA_GENEVE_PORT + IFLA_GENEVE_PORT, /* destination port */ + +#define IFLA_GENEVE_COLLECT_METADATA rpl_IFLA_GENEVE_COLLECT_METADATA + IFLA_GENEVE_COLLECT_METADATA, + +#define __IFLA_GENEVE_MAX rpl__IFLA_GENEVE_MAX + __IFLA_GENEVE_MAX +}; +#undef IFLA_GENEVE_MAX +#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) + +/* STT section */ +enum { + IFLA_STT_PORT, /* destination port */ + __IFLA_STT_MAX +}; +#define IFLA_STT_MAX (__IFLA_STT_MAX - 1) + +/* LISP section */ +enum { + IFLA_LISP_PORT, /* destination port */ + __IFLA_LISP_MAX +}; +#define IFLA_LISP_MAX (__IFLA_LISP_MAX - 1) + +/* VXLAN section */ +enum { +#define IFLA_VXLAN_UNSPEC rpl_IFLA_VXLAN_UNSPEC + IFLA_VXLAN_UNSPEC, +#define IFLA_VXLAN_ID rpl_IFLA_VXLAN_ID + IFLA_VXLAN_ID, +#define IFLA_VXLAN_GROUP rpl_IFLA_VXLAN_GROUP + IFLA_VXLAN_GROUP, /* group or remote address */ +#define IFLA_VXLAN_LINK rpl_IFLA_VXLAN_LINK + IFLA_VXLAN_LINK, +#define IFLA_VXLAN_LOCAL rpl_IFLA_VXLAN_LOCAL + IFLA_VXLAN_LOCAL, +#define IFLA_VXLAN_TTL rpl_IFLA_VXLAN_TTL + IFLA_VXLAN_TTL, +#define IFLA_VXLAN_TOS rpl_IFLA_VXLAN_TOS + IFLA_VXLAN_TOS, +#define IFLA_VXLAN_LEARNING rpl_IFLA_VXLAN_LEARNING + IFLA_VXLAN_LEARNING, +#define IFLA_VXLAN_AGEING rpl_IFLA_VXLAN_AGEING + IFLA_VXLAN_AGEING, +#define IFLA_VXLAN_LIMIT rpl_IFLA_VXLAN_LIMIT + IFLA_VXLAN_LIMIT, +#define IFLA_VXLAN_PORT_RANGE rpl_IFLA_VXLAN_PORT_RANGE + IFLA_VXLAN_PORT_RANGE, /* source port */ +#define 
IFLA_VXLAN_PROXY rpl_IFLA_VXLAN_PROXY + IFLA_VXLAN_PROXY, +#define IFLA_VXLAN_RSC rpl_IFLA_VXLAN_RSC + IFLA_VXLAN_RSC, +#define IFLA_VXLAN_L2MISS rpl_IFLA_VXLAN_L2MISS + IFLA_VXLAN_L2MISS, +#define IFLA_VXLAN_L3MISS rpl_IFLA_VXLAN_L3MISS + IFLA_VXLAN_L3MISS, +#define IFLA_VXLAN_PORT rpl_IFLA_VXLAN_PORT + IFLA_VXLAN_PORT, /* destination port */ +#define IFLA_VXLAN_GROUP6 rpl_IFLA_VXLAN_GROUP6 + IFLA_VXLAN_GROUP6, +#define IFLA_VXLAN_LOCAL6 rpl_IFLA_VXLAN_LOCAL6 + IFLA_VXLAN_LOCAL6, +#define IFLA_VXLAN_UDP_CSUM rpl_IFLA_VXLAN_UDP_CSUM + IFLA_VXLAN_UDP_CSUM, +#define IFLA_VXLAN_UDP_ZERO_CSUM6_TX rpl_IFLA_VXLAN_UDP_ZERO_CSUM6_TX + IFLA_VXLAN_UDP_ZERO_CSUM6_TX, +#define IFLA_VXLAN_UDP_ZERO_CSUM6_RX rpl_IFLA_VXLAN_UDP_ZERO_CSUM6_RX + IFLA_VXLAN_UDP_ZERO_CSUM6_RX, +#define IFLA_VXLAN_REMCSUM_TX rpl_IFLA_VXLAN_REMCSUM_TX + IFLA_VXLAN_REMCSUM_TX, +#define IFLA_VXLAN_REMCSUM_RX rpl_IFLA_VXLAN_REMCSUM_RX + IFLA_VXLAN_REMCSUM_RX, +#define IFLA_VXLAN_GBP rpl_IFLA_VXLAN_GBP + IFLA_VXLAN_GBP, +#define IFLA_VXLAN_REMCSUM_NOPARTIAL rpl_IFLA_VXLAN_REMCSUM_NOPARTIAL + IFLA_VXLAN_REMCSUM_NOPARTIAL, +#define IFLA_VXLAN_COLLECT_METADATA rpl_IFLA_VXLAN_COLLECT_METADATA + IFLA_VXLAN_COLLECT_METADATA, +#define __IFLA_VXLAN_MAX rpl___IFLA_VXLAN_MAX + __IFLA_VXLAN_MAX +}; + +#undef IFLA_VXLAN_MAX +#define IFLA_VXLAN_MAX (rpl___IFLA_VXLAN_MAX - 1) + +#define ifla_vxlan_port_range rpl_ifla_vxlan_port_range +struct ifla_vxlan_port_range { + __be16 low; + __be16 high; +}; + +#ifndef HAVE_RTNL_LINK_STATS64 +/* The main device statistics structure */ +struct rtnl_link_stats64 { + __u64 rx_packets; /* total packets received */ + __u64 tx_packets; /* total packets transmitted */ + __u64 rx_bytes; /* total bytes received */ + __u64 tx_bytes; /* total bytes transmitted */ + __u64 rx_errors; /* bad packets received */ + __u64 tx_errors; /* packet transmit problems */ + __u64 rx_dropped; /* no space in linux buffers */ + __u64 tx_dropped; /* no space available in linux */ + __u64 multicast; /* multicast 
packets received */ + __u64 collisions; + + /* detailed rx_errors: */ + __u64 rx_length_errors; + __u64 rx_over_errors; /* receiver ring buff overflow */ + __u64 rx_crc_errors; /* recved pkt with crc error */ + __u64 rx_frame_errors; /* recv'd frame alignment error */ + __u64 rx_fifo_errors; /* recv'r fifo overrun */ + __u64 rx_missed_errors; /* receiver missed packet */ + + /* detailed tx_errors */ + __u64 tx_aborted_errors; + __u64 tx_carrier_errors; + __u64 tx_fifo_errors; + __u64 tx_heartbeat_errors; + __u64 tx_window_errors; + + /* for cslip etc */ + __u64 rx_compressed; + __u64 tx_compressed; +}; +#endif + +#endif diff --git a/datapath/linux/compat/include/linux/if_vlan.h b/datapath/linux/compat/include/linux/if_vlan.h index 060bb62ba..a8d7bfab6 100644 --- a/datapath/linux/compat/include/linux/if_vlan.h +++ b/datapath/linux/compat/include/linux/if_vlan.h @@ -177,4 +177,56 @@ static inline int rpl_vlan_insert_tag(struct sk_buff *skb, u16 vlan_tci) #define skb_vlan_tag_get(skb) vlan_tx_tag_get(skb) #endif +#ifndef HAVE_VLAN_GET_PROTOCOL + +static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, + int *depth) +{ + unsigned int vlan_depth = skb->mac_len; + + /* if type is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (vlan_depth) { + if (WARN_ON(vlan_depth < VLAN_HLEN)) + return 0; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; + } + do { + struct vlan_hdr *vh; + + if (unlikely(!pskb_may_pull(skb, + vlan_depth + VLAN_HLEN))) + return 0; + + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (type == htons(ETH_P_8021Q) || + type == htons(ETH_P_8021AD)); + } + + if (depth) + *depth = vlan_depth; + + return type; +} + +/** + * vlan_get_protocol - get protocol EtherType. 
+ * @skb: skbuff to query + * + * Returns the EtherType of the packet, regardless of whether it is + * vlan encapsulated (normal or hardware accelerated) or not. + */ +static inline __be16 vlan_get_protocol(struct sk_buff *skb) +{ + return __vlan_get_protocol(skb, skb->protocol, NULL); +} + +#endif #endif /* linux/if_vlan.h wrapper */ diff --git a/datapath/linux/compat/include/linux/list.h b/datapath/linux/compat/include/linux/list.h index 18cce8a37..4234c17ce 100644 --- a/datapath/linux/compat/include/linux/list.h +++ b/datapath/linux/compat/include/linux/list.h @@ -23,4 +23,9 @@ #endif +#ifndef list_first_entry_or_null +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) +#endif + #endif diff --git a/datapath/linux/compat/include/linux/netdev_features.h b/datapath/linux/compat/include/linux/netdev_features.h index 04eb77dc0..e4a310729 100644 --- a/datapath/linux/compat/include/linux/netdev_features.h +++ b/datapath/linux/compat/include/linux/netdev_features.h @@ -5,21 +5,23 @@ #include_next <linux/netdev_features.h> #endif +#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0) +/* On RHEL 6, netdev features are defined in netdevice.h header. 
*/ +#include <linux/netdevice.h> +#endif + #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) #define NETIF_F_HW_VLAN_CTAG_TX NETIF_F_HW_VLAN_TX #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) -#define NETIF_F_GSO_ENCAP_ALL 0 - -#else - #ifndef NETIF_F_GSO_GRE #define NETIF_F_GSO_GRE 0 #endif #ifndef NETIF_F_GSO_GRE_CSUM #define NETIF_F_GSO_GRE_CSUM 0 +#else +#define HAVE_NETIF_F_GSO_GRE_CSUM #endif #ifndef NETIF_F_GSO_IPIP @@ -32,16 +34,39 @@ #ifndef NETIF_F_GSO_UDP_TUNNEL #define NETIF_F_GSO_UDP_TUNNEL 0 +#else +#define HAVE_NETIF_F_GSO_UDP_TUNNEL 0 #endif #ifndef NETIF_F_GSO_UDP_TUNNEL_CSUM #define NETIF_F_GSO_UDP_TUNNEL_CSUM 0 +#define SKB_GSO_UDP_TUNNEL_CSUM 0 #endif #ifndef NETIF_F_GSO_MPLS #define NETIF_F_GSO_MPLS 0 #endif +#ifndef NETIF_F_HW_VLAN_STAG_TX +#define NETIF_F_HW_VLAN_STAG_TX 0 +#endif + +#ifndef NETIF_F_GSO_TUNNEL_REMCSUM +#define NETIF_F_GSO_TUNNEL_REMCSUM 0 +#define SKB_GSO_TUNNEL_REMCSUM 0 +#else +/* Support for REM_CSUM was added in 3.19, but the APIs are not defined + * until 4.0, so turn on REMCSUM support from kernel 4.0 onwards. 
+ */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) +#define HAVE_NETIF_F_GSO_TUNNEL_REMCSUM +#endif +#endif + +#ifndef NETIF_F_RXCSUM +#define NETIF_F_RXCSUM 0 +#endif + #ifndef NETIF_F_GSO_ENCAP_ALL #define NETIF_F_GSO_ENCAP_ALL (NETIF_F_GSO_GRE | \ NETIF_F_GSO_GRE_CSUM | \ @@ -52,6 +77,16 @@ NETIF_F_GSO_MPLS) #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) +#define SKB_GSO_GRE 0 +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) +#define SKB_GSO_UDP_TUNNEL 0 +#endif + +#ifndef HAVE_NETIF_F_GSO_GRE_CSUM +#define SKB_GSO_GRE_CSUM 0 #endif #endif diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h index 0fb2144c6..576989da5 100644 --- a/datapath/linux/compat/include/linux/netdevice.h +++ b/datapath/linux/compat/include/linux/netdevice.h @@ -22,6 +22,13 @@ struct net; #define IFF_LIVE_ADDR_CHANGE 0 #endif +#ifndef IFF_NO_QUEUE +#define IFF_NO_QUEUE 0 +#endif +#ifndef IFF_OPENVSWITCH +#define IFF_OPENVSWITCH 0 +#endif + #ifndef to_net_dev #define to_net_dev(class) container_of(class, struct net_device, NETDEV_DEV_MEMBER) #endif @@ -34,9 +41,8 @@ struct net; #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) -extern void unregister_netdevice_queue(struct net_device *dev, - struct list_head *head); -extern void unregister_netdevice_many(struct list_head *head); +#define unregister_netdevice_queue(dev, head) unregister_netdevice(dev) +#define unregister_netdevice_many(head) #endif #ifndef HAVE_DEV_DISABLE_LRO @@ -112,18 +118,15 @@ struct sk_buff *rpl_skb_gso_segment(struct sk_buff *skb, netdev_features_t featu netdev_features_t rpl_netif_skb_features(struct sk_buff *skb); #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) -static inline int rpl_netif_needs_gso(struct net_device *dev, - struct sk_buff *skb, int features) +#ifdef HAVE_NETIF_NEEDS_GSO_NETDEV +#define netif_needs_gso rpl_netif_needs_gso +static inline bool netif_needs_gso(struct sk_buff *skb, + netdev_features_t features) { 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38) return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || - unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); -#else - return netif_needs_gso(skb, features); -#endif + unlikely((skb->ip_summed != CHECKSUM_PARTIAL) && + (skb->ip_summed != CHECKSUM_UNNECESSARY))); } -#define netif_needs_gso rpl_netif_needs_gso #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) @@ -175,6 +178,11 @@ struct pcpu_sw_netstats { }; #endif +#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0) +/* Use compat version for all Red Hat releases */ +#undef netdev_alloc_pcpu_stats +#endif + #ifndef netdev_alloc_pcpu_stats #define netdev_alloc_pcpu_stats(type) \ ({ \ @@ -191,4 +199,89 @@ struct pcpu_sw_netstats { }) #endif +#ifndef NET_NAME_USER +#define NET_NAME_USER 3 +#endif + +#ifndef HAVE_GRO_REMCSUM +struct gro_remcsum { +}; + +#define skb_gro_remcsum_init(grc) +#define skb_gro_remcsum_cleanup(a1, a2) +#else +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0) + +#define skb_gro_remcsum_process rpl_skb_gro_remcsum_process +static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, + unsigned int off, size_t hdrlen, + int start, int offset, + struct gro_remcsum *grc, + bool nopartial) +{ + __wsum delta; + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + + BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); + + if (!nopartial) { + NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; + return ptr; + } + + ptr = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, off + plen)) { + ptr = skb_gro_header_slow(skb, off + plen, off); + if (!ptr) + return NULL; + } + + delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, + start, offset); + + /* Adjust skb->csum since we changed the packet */ + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + + grc->offset = off + hdrlen + offset; + grc->delta = delta; + + return ptr; +} +#endif +#endif + +#ifndef HAVE_RTNL_LINK_STATS64 +#define dev_get_stats 
rpl_dev_get_stats +struct rtnl_link_stats64 *rpl_dev_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *storage); + +#else +#define HAVE_DEV_TSTATS +#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0) +#undef HAVE_DEV_TSTATS #endif +#endif + +#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0) +/* Only required on RHEL 6. */ +#define dev_get_stats dev_get_stats64 +#endif + +#ifndef netdev_dbg +#define netdev_dbg(__dev, format, args...) \ +do { \ + printk(KERN_DEBUG "%s ", __dev->name); \ + printk(KERN_DEBUG format, ##args); \ +} while (0) +#endif + +#ifndef netdev_info +#define netdev_info(__dev, format, args...) \ +do { \ + printk(KERN_INFO "%s ", __dev->name); \ + printk(KERN_INFO format, ##args); \ +} while (0) + +#endif + +#endif /* __LINUX_NETDEVICE_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/skbuff.h b/datapath/linux/compat/include/linux/skbuff.h index 4d81bc80a..0edcbfdd2 100644 --- a/datapath/linux/compat/include/linux/skbuff.h +++ b/datapath/linux/compat/include/linux/skbuff.h @@ -15,25 +15,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, #endif #include_next <linux/skbuff.h> - #include <linux/jhash.h> -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) -#define SKB_GSO_GRE 0 -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) -#define SKB_GSO_UDP_TUNNEL 0 -#endif - -#ifndef HAVE_SKB_GSO_GRE_CSUM -#define SKB_GSO_GRE_CSUM 0 -#endif - -#ifndef HAVE_SKB_GSO_UDP_TUNNEL_CSUM -#define SKB_GSO_UDP_TUNNEL_CSUM 0 -#endif - #ifndef HAVE_IGNORE_DF_RENAME #define ignore_df local_df #endif @@ -403,4 +386,15 @@ static inline unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int le } #endif + +#ifndef HAVE_SKB_SCRUB_PACKET_XNET +#define skb_scrub_packet rpl_skb_scrub_packet +void rpl_skb_scrub_packet(struct sk_buff *skb, bool xnet); +#endif + +#define skb_pop_mac_header rpl_skb_pop_mac_header +static inline void skb_pop_mac_header(struct sk_buff *skb) +{ + skb->mac_header = skb->network_header; +} #endif diff 
--git a/datapath/linux/compat/include/linux/stddef.h b/datapath/linux/compat/include/linux/stddef.h index 9b68f710f..f2b7c319a 100644 --- a/datapath/linux/compat/include/linux/stddef.h +++ b/datapath/linux/compat/include/linux/stddef.h @@ -12,6 +12,11 @@ enum { }; #endif /* !HAVE_BOOL_TYPE */ +#ifndef offsetofend +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) +#endif + #endif /* __KERNEL__ */ #endif diff --git a/datapath/linux/compat/include/net/dst_metadata.h b/datapath/linux/compat/include/net/dst_metadata.h new file mode 100644 index 000000000..f15bb0308 --- /dev/null +++ b/datapath/linux/compat/include/net/dst_metadata.h @@ -0,0 +1,44 @@ +#ifndef __NET_DST_METADATA_WRAPPER_H +#define __NET_DST_METADATA_WRAPPER_H 1 + +#ifdef HAVE_METADATA_DST +#include_next <net/dst_metadata.h> +#else +#include <linux/skbuff.h> +#include <net/ip_tunnels.h> +#include <net/dst.h> + +struct metadata_dst { + unsigned long dst; + union { + struct ip_tunnel_info tun_info; + } u; +}; + +static inline struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +{ + struct metadata_dst *md_dst; + + md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + if (!md_dst) + return NULL; + + return md_dst; +} +#define skb_tunnel_info ovs_skb_tunnel_info +#endif +static inline void ovs_ip_tun_rx_dst(struct ip_tunnel_info *tun_info, + struct sk_buff *skb, __be16 flags, + __be64 tunnel_id, int md_size) +{ + const struct iphdr *iph = ip_hdr(skb); + + ip_tunnel_key_init(&tun_info->key, + iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, 0, tunnel_id, flags); + tun_info->mode = 0; +} + +void ovs_ip_tunnel_rcv(struct net_device *dev, struct sk_buff *skb, + struct metadata_dst *tun_dst); +#endif /* __NET_DST_METADATA_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/geneve.h b/datapath/linux/compat/include/net/geneve.h index 4f250c2f6..550f4a77e 100644 --- a/datapath/linux/compat/include/net/geneve.h +++ 
b/datapath/linux/compat/include/net/geneve.h @@ -1,17 +1,24 @@ #ifndef __NET_GENEVE_WRAPPER_H #define __NET_GENEVE_WRAPPER_H 1 -#include <linux/version.h> - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) -#include_next <net/geneve.h> -#else - #ifdef CONFIG_INET #include <net/udp_tunnel.h> #endif +#ifdef HAVE_METADATA_DST +#include_next <net/geneve.h> + +static inline int rpl_geneve_init_module(void) +{ + return 0; +} +static inline void rpl_geneve_cleanup_module(void) +{} + +#define geneve_xmit dev_queue_xmit + +#else /* Geneve Header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |Ver| Opt Len |O|C| Rsvd. | Protocol Type | @@ -69,43 +76,19 @@ struct genevehdr { }; #ifdef CONFIG_INET -struct geneve_sock; - -typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb); - -struct geneve_sock { - geneve_rcv_t *rcv; - void *rcv_data; - struct socket *sock; - struct rcu_head rcu; -}; - -#define GENEVE_VER 0 -#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) - -#define geneve_sock_add rpl_geneve_sock_add -struct geneve_sock *rpl_geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6); - -#define geneve_sock_release rpl_geneve_sock_release -void rpl_geneve_sock_release(struct geneve_sock *vs); - -#define geneve_xmit_skb rpl_geneve_xmit_skb -int rpl_geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet); +#define geneve_dev_create_fb rpl_geneve_dev_create_fb +struct net_device *rpl_geneve_dev_create_fb(struct net *net, const char *name, + u8 name_assign_type, u16 dst_port); #endif /*ifdef CONFIG_INET */ -#endif /* kernel < 4.0 */ +int rpl_geneve_init_module(void); +void rpl_geneve_cleanup_module(void); + +#define geneve_xmit rpl_geneve_xmit +netdev_tx_t 
rpl_geneve_xmit(struct sk_buff *skb); -#ifndef HAVE_GENEVE_HDR -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} #endif +#define geneve_init_module rpl_geneve_init_module +#define geneve_cleanup_module rpl_geneve_cleanup_module -#endif /*ifdef__NET_GENEVE_WRAPPER_H */ +#endif /* __NET_GENEVE_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h index 6e0df0fd8..09053b573 100644 --- a/datapath/linux/compat/include/net/gre.h +++ b/datapath/linux/compat/include/net/gre.h @@ -3,6 +3,19 @@ #include <linux/skbuff.h> #include <net/ip_tunnels.h> +#ifdef HAVE_METADATA_DST +#include_next <net/gre.h> + +static inline int rpl_ipgre_init(void) +{ + return 0; +} +static inline void rpl_ipgre_fini(void) +{} + +#define gre_fb_xmit dev_queue_xmit + +#else #include <linux/version.h> #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37) || \ @@ -28,81 +41,28 @@ int rpl_gre_cisco_register(struct gre_cisco_protocol *proto); #define gre_cisco_unregister rpl_gre_cisco_unregister int rpl_gre_cisco_unregister(struct gre_cisco_protocol *proto); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) +#ifndef GRE_HEADER_SECTION struct gre_base_hdr { __be16 flags; __be16 protocol; }; #define GRE_HEADER_SECTION 4 +#endif -static inline __be16 gre_flags_to_tnl_flags(__be16 flags) -{ - __be16 tflags = 0; - - if (flags & GRE_CSUM) - tflags |= TUNNEL_CSUM; - if (flags & GRE_ROUTING) - tflags |= TUNNEL_ROUTING; - if (flags & GRE_KEY) - tflags |= TUNNEL_KEY; - if (flags & GRE_SEQ) - tflags |= TUNNEL_SEQ; - if (flags & GRE_STRICT) - tflags |= TUNNEL_STRICT; - if (flags & GRE_REC) - tflags |= TUNNEL_REC; - if (flags & GRE_VERSION) - tflags |= TUNNEL_VERSION; - - return tflags; -} - -static inline __be16 tnl_flags_to_gre_flags(__be16 tflags) -{ - __be16 flags = 0; - - if (tflags & TUNNEL_CSUM) - flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) - flags |= GRE_ROUTING; - if (tflags & 
TUNNEL_KEY) - flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) - flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) - flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) - flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) - flags |= GRE_VERSION; - - return flags; -} -#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) */ #endif /* HAVE_GRE_CISCO_REGISTER */ -#define gre_handle_offloads rpl_gre_handle_offloads -struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum); +int rpl_ipgre_init(void); +void rpl_ipgre_fini(void); -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0) +#define gretap_fb_dev_create rpl_gretap_fb_dev_create +struct net_device *rpl_gretap_fb_dev_create(struct net *net, const char *name, + u8 name_assign_type); -#define gre_build_header rpl_gre_build_header -void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len); +#define gre_fb_xmit rpl_gre_fb_xmit +netdev_tx_t rpl_gre_fb_xmit(struct sk_buff *skb); +#endif /* HAVE_METADATA_DST */ -#define ip_gre_calc_hlen rpl_ip_gre_calc_hlen -static inline int ip_gre_calc_hlen(__be16 o_flags) -{ - int addend = 4; - - if (o_flags & TUNNEL_CSUM) - addend += 4; - if (o_flags & TUNNEL_KEY) - addend += 4; - if (o_flags & TUNNEL_SEQ) - addend += 4; - return addend; -} -#endif +#define ipgre_init rpl_ipgre_init +#define ipgre_fini rpl_ipgre_fini #endif diff --git a/datapath/linux/compat/include/net/inet_ecn.h b/datapath/linux/compat/include/net/inet_ecn.h new file mode 100644 index 000000000..f0591b322 --- /dev/null +++ b/datapath/linux/compat/include/net/inet_ecn.h @@ -0,0 +1,59 @@ +#ifndef _INET_ECN_WRAPPER_H_ +#define _INET_ECN_WRAPPER_H_ + +#include_next <net/inet_ecn.h> + +#define INET_ECN_decapsulate rpl_INET_ECN_decapsulate +static inline int INET_ECN_decapsulate(struct sk_buff *skb, + __u8 outer, __u8 inner) +{ + if (INET_ECN_is_not_ect(inner)) { + switch (outer & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; + case INET_ECN_ECT_0: + case INET_ECN_ECT_1: + 
return 1; + case INET_ECN_CE: + return 2; + } + } + + if (INET_ECN_is_ce(outer)) + INET_ECN_set_ce(skb); + + return 0; +} + +#define IP_ECN_decapsulate rpl_IP_ECN_decapsulate +static inline int IP_ECN_decapsulate(const struct iphdr *oiph, + struct sk_buff *skb) +{ + __u8 inner; + + if (skb->protocol == htons(ETH_P_IP)) + inner = ip_hdr(skb)->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + inner = ipv6_get_dsfield(ipv6_hdr(skb)); + else + return 0; + + return INET_ECN_decapsulate(skb, oiph->tos, inner); +} + +#define IP6_ECN_decapsulate rpl_IP6_ECN_decapsulate +static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, + struct sk_buff *skb) +{ + __u8 inner; + + if (skb->protocol == htons(ETH_P_IP)) + inner = ip_hdr(skb)->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + inner = ipv6_get_dsfield(ipv6_hdr(skb)); + else + return 0; + + return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); +} +#endif diff --git a/datapath/linux/compat/include/net/ip6_route.h b/datapath/linux/compat/include/net/ip6_route.h new file mode 100644 index 000000000..3f495e783 --- /dev/null +++ b/datapath/linux/compat/include/net/ip6_route.h @@ -0,0 +1,31 @@ +#ifndef __NET_IP6_ROUTE_WRAPPER +#define __NET_IP6_ROUTE_WRAPPER + +#include <net/route.h> +#include <net/ipv6.h> + +#include_next<net/ip6_route.h> + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) + +static inline +struct dst_entry *rpl_ip6_route_output(struct net *net, const struct sock *sk, + struct flowi6 *fl6) +{ + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.oif = fl6->flowi6_oif; + fl.fl6_dst = fl6->daddr; + fl.fl6_src = fl6->saddr; + fl.mark = fl6->flowi6_mark; + fl.proto = fl6->flowi6_proto; + + return ip6_route_output(net, (struct sock *) sk, &fl); +} +#define ip6_route_output rpl_ip6_route_output + +#define ip6_dst_hoplimit(dst) dst_metric(dst, RTAX_HOPLIMIT) + +#endif /* 2.6.39 */ +#endif diff --git a/datapath/linux/compat/include/net/ip6_tunnel.h 
b/datapath/linux/compat/include/net/ip6_tunnel.h new file mode 100644 index 000000000..ce650879b --- /dev/null +++ b/datapath/linux/compat/include/net/ip6_tunnel.h @@ -0,0 +1,33 @@ +#ifndef _NET_IP6_TUNNEL_WRAPER_H +#define _NET_IP6_TUNNEL_WRAPER_H + +#include <linux/ipv6.h> +#include <linux/netdevice.h> +#include <linux/if_tunnel.h> +#include <linux/ip6_tunnel.h> +#include_next <net/ip6_tunnel.h> + +#include "gso.h" + +#define ip6tunnel_xmit rpl_ip6tunnel_xmit +static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, + struct net_device *dev) +{ + int pkt_len, err; + + pkt_len = skb->len - skb_inner_network_offset(skb); + /* TODO: Fix GSO for ipv6 */ +#ifdef HAVE_IP6_LOCAL_OUT_SK + err = ip6_local_out_sk(sk, skb); +#else + err = ip6_local_out(skb); +#endif + /* iptunnel_xmit_stats() wants bytes sent (> 0) or an error (< 0): + * keep pkt_len on success and flag every failure as negative. */ + if (net_xmit_eval(err) != 0) + pkt_len = -1; + + iptunnel_xmit_stats(pkt_len, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats); +} + +#endif diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h index 3ed6f9193..47dce784d 100644 --- a/datapath/linux/compat/include/net/ip_tunnels.h +++ b/datapath/linux/compat/include/net/ip_tunnels.h @@ -69,11 +69,11 @@ struct tnl_ptk_info { #endif #ifndef TUNNEL_GENEVE_OPT -#define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) +#define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) #endif #ifndef TUNNEL_VXLAN_OPT -#define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) +#define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #endif /* Older kernels defined TUNNEL_OPTIONS_PRESENT to GENEVE only */ @@ -83,4 +83,224 @@ struct tnl_ptk_info { #define skb_is_encapsulated ovs_skb_is_encapsulated bool ovs_skb_is_encapsulated(struct sk_buff *skb); +#ifndef HAVE_METADATA_DST +/* Used to memset ip_tunnel padding. */ +#define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) + +/* Used to memset ipv4 address padding.
*/ +#define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) +#define IP_TUNNEL_KEY_IPV4_PAD_LEN \ + (FIELD_SIZEOF(struct ip_tunnel_key, u) - \ + FIELD_SIZEOF(struct ip_tunnel_key, u.ipv4)) + +struct ip_tunnel_key { + __be64 tun_id; + union { + struct { + __be32 src; + __be32 dst; + } ipv4; + struct { + struct in6_addr src; + struct in6_addr dst; + } ipv6; + } u; + __be16 tun_flags; + u8 tos; /* TOS for IPv4, TC for IPv6 */ + u8 ttl; /* TTL for IPv4, HL for IPv6 */ + __be16 tp_src; + __be16 tp_dst; +}; + +/* Flags for ip_tunnel_info mode. */ +#define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ +#define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ + +struct ip_tunnel_info { + struct ip_tunnel_key key; + u8 options_len; + u8 mode; +}; + +static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) +{ + return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; +} + +static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info) +{ + return info + 1; +} + +static inline void ip_tunnel_info_opts_get(void *to, + const struct ip_tunnel_info *info) +{ + memcpy(to, info + 1, info->options_len); +} + +static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, + const void *from, int len) +{ + memcpy(ip_tunnel_info_opts(info), from, len); + info->options_len = len; +} + +static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags) +{ + key->tun_id = tun_id; + key->u.ipv4.src = saddr; + key->u.ipv4.dst = daddr; + memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, + 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); + key->tos = tos; + key->ttl = ttl; + key->tun_flags = tun_flags; + + /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of + * the upper tunnel are used. + * E.g: GRE over IPSEC, the tp_src and tp_port are zero. 
+ */ + key->tp_src = tp_src; + key->tp_dst = tp_dst; + + /* Clear struct padding. */ + if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) + memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, + 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); +} + +#define ip_tunnel_collect_metadata() true + + +#define ip_tunnel rpl_ip_tunnel + +struct ip_tunnel { + struct net_device *dev; + struct net *net; /* netns for packet i/o */ + + int err_count; /* Number of arrived ICMP errors */ + unsigned long err_time; /* Time when the last ICMP error + * arrived + */ + + /* These four fields used only by GRE */ + u32 i_seqno; /* The last seen seqno */ + u32 o_seqno; /* The last output seqno */ + int tun_hlen; /* Precalculated header length */ + int mlink; + + struct ip_tunnel_parm parms; + + int encap_hlen; /* Encap header length (FOU,GUE) */ + int hlen; /* tun_hlen + encap_hlen */ + + int ip_tnl_net_id; + bool collect_md; +}; + +#define ip_tunnel_net rpl_ip_tunnel_net +struct ip_tunnel_net { + struct ip_tunnel __rcu *collect_md_tun; + struct rtnl_link_ops *rtnl_ops; +}; + + +#ifndef HAVE_PCPU_SW_NETSTATS +#define ip_tunnel_get_stats64 rpl_ip_tunnel_get_stats64 +struct rtnl_link_stats64 *rpl_ip_tunnel_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot); +#endif + +#define ip_tunnel_get_dsfield rpl_ip_tunnel_get_dsfield +static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, + const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ipv6_get_dsfield((const struct ipv6hdr *)iph); + else + return 0; +} + +#define ip_tunnel_ecn_encap rpl_ip_tunnel_ecn_encap +static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, + const struct sk_buff *skb) +{ + u8 inner = ip_tunnel_get_dsfield(iph, skb); + + return INET_ECN_encapsulate(tos, inner); +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) +#define iptunnel_xmit_stats(err, stats, dummy) \ +do { \ + if (err > 0) { \ + (stats)->tx_bytes += 
err; \ + (stats)->tx_packets++; \ + } else if (err < 0) { \ + (stats)->tx_errors++; \ + (stats)->tx_aborted_errors++; \ + } else { \ + (stats)->tx_dropped++; \ + } \ +} while (0) + +#else +#define iptunnel_xmit_stats rpl_iptunnel_xmit_stats +static inline void iptunnel_xmit_stats(int err, + struct net_device_stats *err_stats, + struct pcpu_sw_netstats __percpu *stats) +{ + if (err > 0) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(stats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += err; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else if (err < 0) { + err_stats->tx_errors++; + err_stats->tx_aborted_errors++; + } else { + err_stats->tx_dropped++; + } +} +#endif + +#define ip_tunnel_init rpl_ip_tunnel_init +int rpl_ip_tunnel_init(struct net_device *dev); + +#define ip_tunnel_uninit rpl_ip_tunnel_uninit +void rpl_ip_tunnel_uninit(struct net_device *dev); + +#define ip_tunnel_change_mtu rpl_ip_tunnel_change_mtu +int rpl_ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); + +#define ip_tunnel_newlink rpl_ip_tunnel_newlink +int rpl_ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], + struct ip_tunnel_parm *p); + +#define ip_tunnel_dellink rpl_ip_tunnel_dellink +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) +void rpl_ip_tunnel_dellink(struct net_device *dev, struct list_head *head); +#else +void rpl_ip_tunnel_dellink(struct net_device *dev); +#endif + +#define ip_tunnel_init_net rpl_ip_tunnel_init_net +int rpl_ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, + struct rtnl_link_ops *ops, char *devname); + +#define ip_tunnel_delete_net rpl_ip_tunnel_delete_net +void rpl_ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); + +#define ip_tunnel_setup rpl_ip_tunnel_setup +void rpl_ip_tunnel_setup(struct net_device *dev, int net_id); + +#endif /* HAVE_METADATA_DST */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/datapath/linux/compat/include/net/lisp.h 
b/datapath/linux/compat/include/net/lisp.h new file mode 100644 index 000000000..b8af17dbb --- /dev/null +++ b/datapath/linux/compat/include/net/lisp.h @@ -0,0 +1,24 @@ +#ifndef __NET_LISP_WRAPPER_H +#define __NET_LISP_WRAPPER_H 1 + +#ifdef CONFIG_INET +#include <net/udp_tunnel.h> +#endif + + +#ifdef CONFIG_INET +#define lisp_dev_create_fb rpl_lisp_dev_create_fb +struct net_device *rpl_lisp_dev_create_fb(struct net *net, const char *name, + u8 name_assign_type, u16 dst_port); +#endif /* CONFIG_INET */ + +#define lisp_init_module rpl_lisp_init_module +int rpl_lisp_init_module(void); + +#define lisp_cleanup_module rpl_lisp_cleanup_module +void rpl_lisp_cleanup_module(void); + +#define lisp_xmit rpl_lisp_xmit +netdev_tx_t rpl_lisp_xmit(struct sk_buff *skb); + +#endif /* __NET_LISP_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/net_namespace.h b/datapath/linux/compat/include/net/net_namespace.h index edfa131d9..9f5087216 100644 --- a/datapath/linux/compat/include/net/net_namespace.h +++ b/datapath/linux/compat/include/net/net_namespace.h @@ -17,6 +17,9 @@ struct rpl_pernet_operations { #define register_pernet_device rpl_register_pernet_gen_device #define unregister_pernet_device rpl_unregister_pernet_gen_device +#define register_pernet_subsys rpl_register_pernet_gen_device +#define unregister_pernet_subsys rpl_unregister_pernet_gen_device + #define compat_init_net ovs_compat_init_net int ovs_compat_init_net(struct net *net, struct rpl_pernet_operations *pnet); #define compat_exit_net ovs_compat_exit_net @@ -51,7 +54,7 @@ static void rpl_unregister_pernet_gen_##TYPE(struct rpl_pernet_operations *rpl_p #define DEFINE_COMPAT_PNET_REG_FUNC(TYPE) #endif /* 2.6.33 */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) +#ifndef HAVE_POSSIBLE_NET_T typedef struct { #ifdef CONFIG_NET_NS struct net *net; diff --git a/datapath/linux/compat/include/net/route.h b/datapath/linux/compat/include/net/route.h new file mode 100644 index 000000000..bfabdc1a8 --- /dev/null +++
b/datapath/linux/compat/include/net/route.h @@ -0,0 +1,109 @@ +#ifndef __NET_ROUTE_H_WRAPPER +#define __NET_ROUTE_H_WRAPPER + +#include_next <net/route.h> + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) +struct flowi_common { + int flowic_oif; + __u32 flowic_mark; + __u8 flowic_tos; + __u8 flowic_proto; +}; + +union flowi_uli { + struct { + __be16 dport; + __be16 sport; + } ports; + + struct { + __u8 type; + __u8 code; + } icmpt; + + struct { + __le16 dport; + __le16 sport; + } dnports; + + __be32 spi; + __be32 gre_key; + + struct { + __u8 type; + } mht; +}; + +struct flowi4 { + struct flowi_common __fl_common; +#define flowi4_oif __fl_common.flowic_oif +#define flowi4_iif __fl_common.flowic_iif +#define flowi4_mark __fl_common.flowic_mark +#define flowi4_tos __fl_common.flowic_tos +#define flowi4_scope __fl_common.flowic_scope +#define flowi4_proto __fl_common.flowic_proto +#define flowi4_flags __fl_common.flowic_flags +#define flowi4_secid __fl_common.flowic_secid +#define flowi4_tun_key __fl_common.flowic_tun_key + + union flowi_uli uli; +#define fl4_gre_key uli.gre_key + + /* (saddr,daddr) must be grouped, same order as in IP header */ + __be32 saddr; + __be32 daddr; + +} __attribute__((__aligned__(BITS_PER_LONG/8))); + +struct flowi6 { + struct flowi_common __fl_common; +#define flowi6_oif __fl_common.flowic_oif +#define flowi6_iif __fl_common.flowic_iif +#define flowi6_mark __fl_common.flowic_mark +#define flowi6_tos __fl_common.flowic_tos +#define flowi6_scope __fl_common.flowic_scope +#define flowi6_proto __fl_common.flowic_proto +#define flowi6_flags __fl_common.flowic_flags +#define flowi6_secid __fl_common.flowic_secid +#define flowi6_tun_key __fl_common.flowic_tun_key + struct in6_addr daddr; + struct in6_addr saddr; + __be32 flowlabel; + union flowi_uli uli; +#define fl6_sport uli.ports.sport +#define fl6_dport uli.ports.dport +#define fl6_icmp_type uli.icmpt.type +#define fl6_icmp_code uli.icmpt.code +#define fl6_ipsec_spi uli.spi +#define 
fl6_mh_type uli.mht.type +#define fl6_gre_key uli.gre_key +} __attribute__((__aligned__(BITS_PER_LONG/8))); + +static inline struct rtable *rpl_ip_route_output_key(struct net *net, struct flowi4 *flp) +{ + struct rtable *rt; + /* Tunnel configuration keeps DSCP part of TOS bits, But Linux + * router expect RT_TOS bits only. + */ + + struct flowi fl = { .nl_u = { .ip4_u = { + .daddr = flp->daddr, + .saddr = flp->saddr, + .tos = RT_TOS(flp->flowi4_tos) } }, + .mark = flp->flowi4_mark, + .proto = flp->flowi4_proto }; + + if (unlikely(ip_route_output_key(net, &rt, &fl))) + return ERR_PTR(-EADDRNOTAVAIL); + flp->saddr = fl.nl_u.ip4_u.saddr; + return rt; +} +#define ip_route_output_key rpl_ip_route_output_key + +static inline int ip4_dst_hoplimit(const struct dst_entry *dst) +{ + return dst_metric(dst, RTAX_HOPLIMIT); +} +#endif +#endif diff --git a/datapath/linux/compat/include/net/rtnetlink.h b/datapath/linux/compat/include/net/rtnetlink.h new file mode 100644 index 000000000..6db4a76ab --- /dev/null +++ b/datapath/linux/compat/include/net/rtnetlink.h @@ -0,0 +1,30 @@ +#ifndef __NET_RTNETLINK_WRAPPER_H +#define __NET_RTNETLINK_WRAPPER_H +#include_next <net/rtnetlink.h> + +#define rtnl_delete_link rpl_rtnl_delete_link +int rpl_rtnl_delete_link(struct net_device *dev); + +#ifndef HAVE_NAME_ASSIGN_TYPE +#ifdef HAVE_RTNL_CREATE_LINK_SRC_NET +static inline struct net_device *rpl_rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[]) +{ + return rtnl_create_link(net, net, (char *)ifname, ops, tb); +} + +#else +static inline struct net_device *rpl_rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[]) +{ + return rtnl_create_link(net, (char *)ifname, ops, tb); +} +#endif +#define rtnl_create_link rpl_rtnl_create_link +#endif + +#endif diff --git a/datapath/linux/compat/include/net/stt.h 
b/datapath/linux/compat/include/net/stt.h index 13812b1f2..28d4dc53c 100644 --- a/datapath/linux/compat/include/net/stt.h +++ b/datapath/linux/compat/include/net/stt.h @@ -2,6 +2,7 @@ #define __NET_STT_H 1 #include <linux/kconfig.h> +#include <linux/errno.h> #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) && IS_ENABLED(CONFIG_NETFILTER) #include <net/ip_tunnels.h> #define OVS_STT @@ -30,42 +31,37 @@ static inline struct stthdr *stt_hdr(const struct sk_buff *skb) sizeof(struct tcphdr)); } -struct stt_sock; -typedef void (stt_rcv_t)(struct stt_sock *stt_sock, struct sk_buff *skb); +struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name, + u8 name_assign_type, u16 dst_port); -/* @list: Per-net list of STT ports. - * @rcv: The callback is called on STT packet recv, STT reassembly can generate - * multiple packets, in this case first packet has tunnel outer header, rest - * of the packets are inner packet segments with no stt header. - * @rcv_data: user data. - * @sock: Fake TCP socket for the STT port. 
- */ -struct stt_sock { - struct list_head list; - stt_rcv_t *rcv; - void *rcv_data; - struct socket *sock; - struct rcu_head rcu; -}; +netdev_tx_t ovs_stt_xmit(struct sk_buff *skb); -#define stt_sock_add rpl_stt_sock_add -struct stt_sock *rpl_stt_sock_add(struct net *net, __be16 port, - stt_rcv_t *rcv, void *data); +int ovs_stt_init_module(void); -#define stt_sock_release rpl_stt_sock_release -void rpl_stt_sock_release(struct stt_sock *stt_sock); +void ovs_stt_cleanup_module(void); +#else +static inline int ovs_stt_init_module(void) +{ + return 0; +} -#define stt_xmit_skb rpl_stt_xmit_skb -int rpl_stt_xmit_skb(struct sk_buff *skb, struct rtable *rt, - __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be64 tun_id); +static inline void ovs_stt_cleanup_module(void) +{} -#define stt_init_module ovs_stt_init_module -int ovs_stt_init_module(void); +static inline struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name, + u8 name_assign_type, u16 dst_port) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline netdev_tx_t ovs_stt_xmit(struct sk_buff *skb) +{ + BUG(); + return NETDEV_TX_OK; +} +#endif +#define stt_dev_create_fb ovs_stt_dev_create_fb +#define stt_init_module ovs_stt_init_module #define stt_cleanup_module ovs_stt_cleanup_module -void ovs_stt_cleanup_module(void); -#endif #endif /*ifdef__NET_STT_H */ diff --git a/datapath/linux/compat/include/net/udp_tunnel.h b/datapath/linux/compat/include/net/udp_tunnel.h index d33474648..85aed9809 100644 --- a/datapath/linux/compat/include/net/udp_tunnel.h +++ b/datapath/linux/compat/include/net/udp_tunnel.h @@ -4,12 +4,14 @@ #include <linux/version.h> #include <linux/kconfig.h> -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) +#include <net/dst_metadata.h> +#include <linux/netdev_features.h> +#ifdef HAVE_UDP_TUNNEL_IPV6 #include_next <net/udp_tunnel.h> static inline struct sk_buff * rpl_udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum, - bool 
is_vxlan) + int type, bool is_vxlan) { if (skb_is_gso(skb) && skb_is_encapsulated(skb)) { kfree_skb(skb); @@ -19,18 +21,6 @@ rpl_udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum, } #define udp_tunnel_handle_offloads rpl_udp_tunnel_handle_offloads -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) -static inline int rpl_udp_tunnel_xmit_skb(struct rtable *rt, - struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, - __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck) -{ - return udp_tunnel_xmit_skb(rt, skb, src, dst, tos, ttl, df, src_port, - dst_port, xnet, nocheck); -} -#define udp_tunnel_xmit_skb rpl_udp_tunnel_xmit_skb -#endif #else #include <net/ip_tunnels.h> @@ -58,7 +48,8 @@ struct udp_port_cfg { __be16 peer_udp_port; unsigned int use_udp_checksums:1, use_udp6_tx_checksums:1, - use_udp6_rx_checksums:1; + use_udp6_rx_checksums:1, + ipv6_v6only:1; }; #define udp_sock_create rpl_udp_sock_create @@ -96,13 +87,20 @@ void rpl_udp_tunnel_sock_release(struct socket *sock); void ovs_udp_gso(struct sk_buff *skb); void ovs_udp_csum_gso(struct sk_buff *skb); +#define udp_tunnel_encap_enable(sock) udp_encap_enable() static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum, + int type, bool is_vxlan) { - int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; void (*fix_segment)(struct sk_buff *); + if (skb_is_gso(skb) && skb_is_encapsulated(skb)) { + kfree_skb(skb); + return ERR_PTR(-ENOSYS); + } + + type |= udp_csum ? 
SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; if (!udp_csum) fix_segment = ovs_udp_gso; else @@ -116,7 +114,38 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, return ovs_iptunnel_handle_offloads(skb, udp_csum, type, fix_segment); } -#define udp_tunnel_encap_enable(sock) udp_encap_enable() +#if IS_ENABLED(CONFIG_IPV6) +#define udp_tunnel6_xmit_skb rpl_udp_tunnel6_xmit_skb +int rpl_udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, struct in6_addr *saddr, + struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be16 src_port, + __be16 dst_port, bool nocheck); +#endif + +static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct udphdr *uh; + + uh = (struct udphdr *)(skb->data + nhoff - sizeof(struct udphdr)); + skb_shinfo(skb)->gso_type |= uh->check ? + SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; +} +#endif + +static inline void ovs_udp_tun_rx_dst(struct ip_tunnel_info *info, + struct sk_buff *skb, + unsigned short family, + __be16 flags, __be64 tunnel_id, int md_size) +{ + if (family == AF_INET) + ovs_ip_tun_rx_dst(info, skb, flags, tunnel_id, md_size); + + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + if (udp_hdr(skb)->check) + info->key.tun_flags |= TUNNEL_CSUM; +} -#endif /* Linux version < 4.0 */ #endif diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h index cafff7954..13de97ac1 100644 --- a/datapath/linux/compat/include/net/vxlan.h +++ b/datapath/linux/compat/include/net/vxlan.h @@ -1,32 +1,39 @@ #ifndef __NET_VXLAN_WRAPPER_H -#define __NET_VXLAN_WRAPPER_H 1 +#define __NET_VXLAN_WRAPPER_H 1 -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/udp.h> -#include <net/gre.h> - -#include <linux/version.h> +#ifdef CONFIG_INET +#include <net/udp_tunnel.h> +#endif -#ifdef HAVE_VXLAN_METADATA -#define USE_UPSTREAM_VXLAN +#ifdef HAVE_METADATA_DST 
#include_next <net/vxlan.h> -#endif -#ifndef VXLAN_HLEN -/* VXLAN header flags. */ -#define VXLAN_HF_VNI 0x08000000 -#ifndef VXLAN_HF_GBP -#define VXLAN_HF_GBP 0x80000000 -#endif +static inline int rpl_vxlan_init_module(void) +{ + return 0; +} +static inline void rpl_vxlan_cleanup_module(void) +{} -#define VXLAN_N_VID (1u << 24) -#define VXLAN_VID_MASK (VXLAN_N_VID - 1) -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) -#endif +#define vxlan_xmit dev_queue_xmit + +#else + +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/if_vlan.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/udp.h> +#include <net/dst_metadata.h> + +#include "compat.h" +#include "gso.h" + +#define VNI_HASH_BITS 10 +#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) -#ifndef VXLAN_GBP_USED_BITS /* * VXLAN Group Based Policy Extension: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -64,6 +71,7 @@ struct vxlanhdr_gbp { __be16 policy_id; __be32 vx_vni; }; + #define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | 0xFFFFFF) /* skb->mark mapping @@ -76,75 +84,194 @@ struct vxlanhdr_gbp { #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) -#define VXLAN_F_GBP 0x800 -#endif - -#ifndef VXLAN_F_UDP_CSUM -#define VXLAN_F_UDP_CSUM 0x40 -#endif - -#ifndef VXLAN_F_RCV_FLAGS -#define VXLAN_F_RCV_FLAGS VXLAN_F_GBP -#endif +/* VXLAN protocol header: + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |G|R|R|R|I|R|R|C| Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | VXLAN Network Identifier (VNI) | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * G = 1 Group Policy (VXLAN-GBP) + * I = 1 VXLAN Network Identifier (VNI) present + * C = 1 Remote checksum offload (RCO) + */ +struct vxlanhdr { + __be32 vx_flags; + __be32 vx_vni; +}; -#ifdef USE_UPSTREAM_VXLAN -static inline int rpl_vxlan_xmit_skb(struct rtable *rt, struct sock 
*sk, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - struct vxlan_metadata *md, bool xnet, u32 vxflags) -{ - if (skb_is_gso(skb) && skb_is_encapsulated(skb)) { - kfree_skb(skb); - return -ENOSYS; - } +/* VXLAN header flags. */ +#define VXLAN_HF_RCO BIT(24) +#define VXLAN_HF_VNI BIT(27) +#define VXLAN_HF_GBP BIT(31) -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) - return vxlan_xmit_skb(rt, skb, src, dst, tos, ttl, df, -#else - return vxlan_xmit_skb(rt, sk, skb, src, dst, tos, ttl, df, -#endif - src_port, dst_port, md, xnet, vxflags); -} +/* Remote checksum offload header option */ +#define VXLAN_RCO_MASK 0x7f /* Last byte of vni field */ +#define VXLAN_RCO_UDP 0x80 /* Indicate UDP RCO (TCP when not set *) */ +#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ +#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) +#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT) -#define vxlan_xmit_skb rpl_vxlan_xmit_skb -#else /* USE_UPSTREAM_VXLAN */ +#define VXLAN_N_VID (1u << 24) +#define VXLAN_VID_MASK (VXLAN_N_VID - 1) +#define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) +#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) struct vxlan_metadata { - __be32 vni; - u32 gbp; + __be32 vni; + u32 gbp; }; -#define vxlan_sock rpl_vxlan_sock -struct rpl_vxlan_sock; - -#define vxlan_rcv_t rpl_vxlan_rcv_t -typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb, - struct vxlan_metadata *md); +#define VNI_HASH_BITS 10 +#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) +#define FDB_HASH_BITS 8 +#define FDB_HASH_SIZE (1<<FDB_HASH_BITS) /* per UDP socket information */ struct vxlan_sock { struct hlist_node hlist; - vxlan_rcv_t *rcv; - void *data; struct work_struct del_work; struct socket *sock; struct rcu_head rcu; + struct hlist_head vni_list[VNI_HASH_SIZE]; + atomic_t refcnt; +#ifdef HAVE_UDP_OFFLOAD + struct udp_offload udp_offloads; +#endif u32 flags; }; -#define vxlan_sock_add 
rpl_vxlan_sock_add -struct vxlan_sock *rpl_vxlan_sock_add(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, - bool no_share, u32 flags); +union vxlan_addr { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct sockaddr sa; +}; + +struct vxlan_rdst { + union vxlan_addr remote_ip; + __be16 remote_port; + u32 remote_vni; + u32 remote_ifindex; + struct list_head list; + struct rcu_head rcu; +}; + +struct vxlan_config { + union vxlan_addr remote_ip; + union vxlan_addr saddr; + u32 vni; + int remote_ifindex; + int mtu; + __be16 dst_port; + __u16 port_min; + __u16 port_max; + __u8 tos; + __u8 ttl; + u32 flags; + unsigned long age_interval; + unsigned int addrmax; + bool no_share; +}; + +/* Pseudo network device */ +struct vxlan_dev { + struct hlist_node hlist; /* vni hash table */ + struct list_head next; /* vxlan's per namespace list */ + struct vxlan_sock *vn_sock; /* listening socket */ + struct net_device *dev; + struct net *net; /* netns for packet i/o */ + struct vxlan_rdst default_dst; /* default destination */ + u32 flags; /* VXLAN_F_* in vxlan.h */ + + struct timer_list age_timer; + spinlock_t hash_lock; + unsigned int addrcnt; -#define vxlan_sock_release rpl_vxlan_sock_release -void rpl_vxlan_sock_release(struct vxlan_sock *vs); + struct vxlan_config cfg; + struct hlist_head fdb_head[FDB_HASH_SIZE]; +}; + +#define VXLAN_F_LEARN 0x01 +#define VXLAN_F_PROXY 0x02 +#define VXLAN_F_RSC 0x04 +#define VXLAN_F_L2MISS 0x08 +#define VXLAN_F_L3MISS 0x10 +#define VXLAN_F_IPV6 0x20 +#define VXLAN_F_UDP_CSUM 0x40 +#define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80 +#define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100 +#define VXLAN_F_REMCSUM_TX 0x200 +#define VXLAN_F_REMCSUM_RX 0x400 +#define VXLAN_F_GBP 0x800 +#define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 +#define VXLAN_F_COLLECT_METADATA 0x2000 + +/* Flags that are used in the receive path. 
These flags must match in + * order for a socket to be shareable + */ +#define VXLAN_F_RCV_FLAGS (VXLAN_F_GBP | \ + VXLAN_F_UDP_ZERO_CSUM6_RX | \ + VXLAN_F_REMCSUM_RX | \ + VXLAN_F_REMCSUM_NOPARTIAL | \ + VXLAN_F_COLLECT_METADATA) +#define vxlan_dev_create rpl_vxlan_dev_create +struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name, + u8 name_assign_type, struct vxlan_config *conf); + +static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan) +{ + return inet_sport(vxlan->vn_sock->sock->sk); +} + +static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, + netdev_features_t features) +{ + u8 l4_hdr = 0; + + if (!skb_encapsulation(skb)) + return features; + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): + l4_hdr = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + l4_hdr = ipv6_hdr(skb)->nexthdr; + break; + default: + return features; + } + + if ((l4_hdr == IPPROTO_UDP) && ( +#ifdef ENCAP_TYPE_ETHER + skb->inner_protocol_type != ENCAP_TYPE_ETHER || +#endif + ovs_skb_get_inner_protocol(skb) != htons(ETH_P_TEB) || + (skb_inner_mac_header(skb) - skb_transport_header(skb) != + sizeof(struct udphdr) + sizeof(struct vxlanhdr)))) + return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + + return features; +} + +/* IP header + UDP + VXLAN + Ethernet header */ +#define VXLAN_HEADROOM (20 + 8 + 8 + 14) +/* IPv6 header + UDP + VXLAN + Ethernet header */ +#define VXLAN6_HEADROOM (40 + 8 + 8 + 14) + +static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) +{ + return vs->sock->sk->sk_family; +} + +int rpl_vxlan_init_module(void); +void rpl_vxlan_cleanup_module(void); + +#define vxlan_xmit rpl_vxlan_xmit +netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb); +#endif -#define vxlan_xmit_skb rpl_vxlan_xmit_skb -int rpl_vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, - struct vxlan_metadata 
*md, bool xnet, u32 vxflags); +#define vxlan_init_module rpl_vxlan_init_module +#define vxlan_cleanup_module rpl_vxlan_cleanup_module -#endif /* !HAVE_VXLAN_METADATA */ #endif diff --git a/datapath/linux/compat/ip_gre.c b/datapath/linux/compat/ip_gre.c new file mode 100644 index 000000000..c9197e965 --- /dev/null +++ b/datapath/linux/compat/ip_gre.c @@ -0,0 +1,680 @@ +/* + * Linux NET3: GRE over IP protocol decoder. + * + * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/kconfig.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/netdev_features.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/if_vlan.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ip_tunnels.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/rtnetlink.h> +#include <net/gre.h> +#include <net/dst_metadata.h> + +#ifndef HAVE_METADATA_DST +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#endif + +#include "gso.h" +#include 
"vport-netdev.h" + +static int gre_tap_net_id __read_mostly; + +#define ip_gre_calc_hlen rpl_ip_gre_calc_hlen +static int ip_gre_calc_hlen(__be16 o_flags) +{ + int addend = 4; + + if (o_flags & TUNNEL_CSUM) + addend += 4; + if (o_flags & TUNNEL_KEY) + addend += 4; + if (o_flags & TUNNEL_SEQ) + addend += 4; + return addend; +} + +#define tnl_flags_to_gre_flags rpl_tnl_flags_to_gre_flags +static __be16 tnl_flags_to_gre_flags(__be16 tflags) +{ + __be16 flags = 0; + + if (tflags & TUNNEL_CSUM) + flags |= GRE_CSUM; + if (tflags & TUNNEL_ROUTING) + flags |= GRE_ROUTING; + if (tflags & TUNNEL_KEY) + flags |= GRE_KEY; + if (tflags & TUNNEL_SEQ) + flags |= GRE_SEQ; + if (tflags & TUNNEL_STRICT) + flags |= GRE_STRICT; + if (tflags & TUNNEL_REC) + flags |= GRE_REC; + if (tflags & TUNNEL_VERSION) + flags |= GRE_VERSION; + + return flags; +} + +static __be64 key_to_tunnel_id(__be32 key) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)((__force u32)key); +#else + return (__force __be64)((__force u64)key << 32); +#endif +} + +/* Returns the least-significant 32 bits of a __be64. 
*/ +static __be32 tunnel_id_to_key(__be64 x) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)x; +#else + return (__force __be32)((__force u64)x >> 32); +#endif +} + +static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) +{ + struct net *net = dev_net(skb->dev); + struct metadata_dst tun_dst; + struct ip_tunnel_net *itn; + const struct iphdr *iph; + struct ip_tunnel *tunnel; + + if (tpi->proto != htons(ETH_P_TEB)) + return PACKET_REJECT; + + itn = net_generic(net, gre_tap_net_id); + + iph = ip_hdr(skb); + tunnel = rcu_dereference(itn->collect_md_tun); + if (tunnel) { + __be16 flags; + __be64 tun_id; + int err; + + + skb_pop_mac_header(skb); + flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); + tun_id = key_to_tunnel_id(tpi->key); + ovs_ip_tun_rx_dst(&tun_dst.u.tun_info, skb, flags, tun_id, 0); + + skb_reset_network_header(skb); + err = IP_ECN_decapsulate(iph, skb); + if (unlikely(err)) { + if (err > 1) { + ++tunnel->dev->stats.rx_frame_errors; + ++tunnel->dev->stats.rx_errors; + return PACKET_REJECT; + } + } + + ovs_ip_tunnel_rcv(tunnel->dev, skb, &tun_dst); + return PACKET_RCVD; + } + return PACKET_REJECT; +} + +static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) +{ + if (ipgre_rcv(skb, tpi) == PACKET_RCVD) + return 0; + + kfree_skb(skb); + return 0; +} + +#ifndef HAVE_GRE_HANDLE_OFFLOADS +static void gre_nop_fix(struct sk_buff *skb) { } + +static void gre_csum_fix(struct sk_buff *skb) +{ + struct gre_base_hdr *greh; + __be32 *options; + int gre_offset = skb_transport_offset(skb); + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + options = ((__be32 *)greh + 1); + + *options = 0; + *(__sum16 *)options = csum_fold(skb_checksum(skb, gre_offset, + skb->len - gre_offset, 0)); +} + +static bool is_gre_gso(struct sk_buff *skb) +{ + return skb_is_gso(skb); +} + +static struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum) +{ + int type = gre_csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE; + gso_fix_segment_t fix_segment; + + if (gre_csum) + fix_segment = gre_csum_fix; + else + fix_segment = gre_nop_fix; + + return ovs_iptunnel_handle_offloads(skb, gre_csum, type, fix_segment); +} +#else + +static bool is_gre_gso(struct sk_buff *skb) +{ + return skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE | SKB_GSO_GRE_CSUM); +} + +static struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum) +{ + if (skb_is_gso(skb) && skb_is_encapsulated(skb)) { + kfree_skb(skb); + return ERR_PTR(-ENOSYS); + } +#undef gre_handle_offloads + return gre_handle_offloads(skb, gre_csum); +} +#endif + +static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, + __be16 proto, __be32 key, __be32 seq) +{ + struct gre_base_hdr *greh; + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + greh = (struct gre_base_hdr *)skb->data; + greh->flags = tnl_flags_to_gre_flags(flags); + greh->protocol = proto; + + if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); + + if (flags & TUNNEL_SEQ) { + *ptr = seq; + ptr--; + } + if (flags & TUNNEL_KEY) { + *ptr = key; + ptr--; + } + if (flags & TUNNEL_CSUM && !is_gre_gso(skb)) { + *ptr = 0; + *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, + skb->len, 0)); + } + } + ovs_skb_set_inner_protocol(skb, proto); +} + + +netdev_tx_t rpl_gre_fb_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct flowi4 fl; + struct rtable *rt; + int min_headroom; + int tunnel_hlen; + __be16 df, flags; + int err; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + memset(&fl, 0, sizeof(fl)); + fl.daddr = key->u.ipv4.dst; + fl.saddr = key->u.ipv4.src; + fl.flowi4_tos = 
RT_TOS(key->tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_GRE; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) + goto err_free_skb; + + /* Size the GRE header from the same flag subset that is handed to + * build_header() below. Counting flags that are then masked off (e.g. + * TUNNEL_SEQ) would reserve a word build_header() never writes and put + * the key and checksum at the wrong offsets. + */ + tunnel_hlen = ip_gre_calc_hlen(key->tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY)); + + min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len + + tunnel_hlen + sizeof(struct iphdr) + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto err_free_rt; + } + + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) { + err = -ENOMEM; + goto err_free_rt; + } + + /* Push Tunnel header. */ + skb = rpl_gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)); + if (IS_ERR(skb)) { + skb = NULL; + goto err_free_rt; + } + + flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), + tunnel_id_to_key(tun_info->key.tun_id), 0); + + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, + key->u.ipv4.dst, IPPROTO_GRE, + key->tos, key->ttl, df, false); + iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats); + return NETDEV_TX_OK; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} +EXPORT_SYMBOL(rpl_gre_fb_xmit); + +#define GRE_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_HW_CSUM | \ + NETIF_F_NETNS_LOCAL) + +static void __gre_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel; + int t_hlen; + + tunnel = netdev_priv(dev); + tunnel->parms.iph.protocol = IPPROTO_GRE; + tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + + t_hlen = tunnel->hlen + sizeof(struct iphdr); + + dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; + dev->mtu = ETH_DATA_LEN - t_hlen - 4; + + dev->features |= GRE_FEATURES; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + dev->hw_features |= GRE_FEATURES; +#endif + + if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported. */ + dev->features |= NETIF_F_GSO_SOFTWARE; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + dev->hw_features |= NETIF_F_GSO_SOFTWARE; +#endif + /* Can use a lockless transmit, unless we generate + * output sequences + */ + dev->features |= NETIF_F_LLTX; + } +} + +/* Called with rcu_read_lock and BH disabled. 
*/ +static int gre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) +{ + return PACKET_REJECT; +} + +static struct gre_cisco_protocol ipgre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, + .priority = 1, +}; + +static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be16 flags; + + if (!data) + return 0; + + flags = 0; + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (flags & (GRE_VERSION|GRE_ROUTING)) + return -EINVAL; + + return 0; +} + +static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be32 daddr; + + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + if (!data) + goto out; + + if (data[IFLA_GRE_REMOTE]) { + memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); + if (!daddr) + return -EINVAL; + } + +out: + return ipgre_tunnel_validate(tb, data); +} + +static void ipgre_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + parms->iph.protocol = IPPROTO_GRE; +} + +static int gre_tap_init(struct net_device *dev) +{ + __gre_tunnel_init(dev); + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + + return ip_tunnel_init(dev); +} + +static netdev_tx_t gre_dev_xmit(struct sk_buff *skb, struct net_device *dev) +{ + /* Drop All packets coming from networking stack. OVS-CB is + * not initialized for these packets. 
+ */ + + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + +static const struct net_device_ops gre_tap_netdev_ops = { + .ndo_init = gre_tap_init, + .ndo_uninit = ip_tunnel_uninit, + .ndo_start_xmit = gre_dev_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip_tunnel_change_mtu, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + .ndo_get_stats64 = ip_tunnel_get_stats64, +#endif +#ifdef HAVE_NDO_GET_IFLINK + .ndo_get_iflink = ip_tunnel_get_iflink, +#endif +}; + +static void ipgre_tap_setup(struct net_device *dev) +{ + ether_setup(dev); + dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + ip_tunnel_setup(dev, gre_tap_net_id); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) +static int ipgre_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +#else +static int ipgre_newlink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +#endif +{ + struct ip_tunnel_parm p; + int err; + + ipgre_netlink_parms(dev, data, tb, &p); + err = ip_tunnel_newlink(dev, tb, &p); + return err; + +} + +static size_t ipgre_get_size(const struct net_device *dev) +{ + return + /* IFLA_GRE_LINK */ + nla_total_size(4) + + /* IFLA_GRE_IFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_OFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_IKEY */ + nla_total_size(4) + + /* IFLA_GRE_OKEY */ + nla_total_size(4) + + /* IFLA_GRE_LOCAL */ + nla_total_size(4) + + /* IFLA_GRE_REMOTE */ + nla_total_size(4) + + /* IFLA_GRE_TTL */ + nla_total_size(1) + + /* IFLA_GRE_TOS */ + nla_total_size(1) + + /* IFLA_GRE_PMTUDISC */ + nla_total_size(1) + + /* IFLA_GRE_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_DPORT */ + nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + + 0; +} + +static int 
ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm *p = &t->parms; + + if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || + nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || + nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || + nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || + nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) || + nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || + nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || + nla_put_u8(skb, IFLA_GRE_PMTUDISC, + !!(p->iph.frag_off & htons(IP_DF)))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { + [IFLA_GRE_LINK] = { .type = NLA_U32 }, + [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_IKEY] = { .type = NLA_U32 }, + [IFLA_GRE_OKEY] = { .type = NLA_U32 }, + [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_GRE_TTL] = { .type = NLA_U8 }, + [IFLA_GRE_TOS] = { .type = NLA_U8 }, + [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, +}; + +static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { + .kind = "ovs_gretap", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipgre_tap_setup, + .validate = ipgre_tap_validate, + .newlink = ipgre_newlink, + .dellink = ip_tunnel_dellink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, +#ifdef HAVE_GET_LINK_NET + .get_link_net = ip_tunnel_get_link_net, +#endif +}; + +struct net_device *rpl_gretap_fb_dev_create(struct net *net, const char *name, + u8 name_assign_type) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ip_tunnel *t; + int 
err; + + memset(&tb, 0, sizeof(tb)); + + dev = rtnl_create_link(net, (char *)name, name_assign_type, + &ipgre_tap_ops, tb); + if (IS_ERR(dev)) + return dev; + + t = netdev_priv(dev); + t->collect_md = true; + /* Configure flow based GRE device. */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + err = ipgre_newlink(net, dev, tb, NULL); +#else + err = ipgre_newlink(dev, tb, NULL); +#endif + if (err < 0) + goto out; + return dev; +out: + free_netdev(dev); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(rpl_gretap_fb_dev_create); + +static int __net_init ipgre_tap_init_net(struct net *net) +{ + return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); +} + +static void __net_exit ipgre_tap_exit_net(struct net *net) +{ + struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); + + ip_tunnel_delete_net(itn, &ipgre_tap_ops); +} + +static struct pernet_operations ipgre_tap_net_ops = { + .init = ipgre_tap_init_net, + .exit = ipgre_tap_exit_net, + .id = &gre_tap_net_id, + .size = sizeof(struct ip_tunnel_net), +}; + +DEFINE_COMPAT_PNET_REG_FUNC(device); + +int rpl_ipgre_init(void) +{ + int err; + + err = register_pernet_device(&ipgre_tap_net_ops); + if (err < 0) + goto pnet_tap_faied; + + err = gre_cisco_register(&ipgre_protocol); + if (err < 0) { + pr_info("%s: can't add protocol\n", __func__); + goto add_proto_failed; + } + + err = rtnl_link_register(&ipgre_tap_ops); + if (err < 0) + goto tap_ops_failed; + + pr_info("GRE over IPv4 tunneling driver\n"); + return 0; + +tap_ops_failed: + gre_cisco_unregister(&ipgre_protocol); +add_proto_failed: + unregister_pernet_device(&ipgre_tap_net_ops); +pnet_tap_faied: + return err; +} + +void rpl_ipgre_fini(void) +{ + rtnl_link_unregister(&ipgre_tap_ops); + gre_cisco_unregister(&ipgre_protocol); + unregister_pernet_device(&ipgre_tap_net_ops); +} + +#endif diff --git a/datapath/linux/compat/ip_tunnel.c b/datapath/linux/compat/ip_tunnel.c new file mode 100644 index 000000000..f43e2d457 --- /dev/null +++ 
b/datapath/linux/compat/ip_tunnel.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/kconfig.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/rculist.h> +#include <linux/err.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ip_tunnels.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/rtnetlink.h> +#include <net/udp.h> + +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#endif + +#include "compat.h" + +#ifndef HAVE_METADATA_DST +static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) +{ + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, t); + else + WARN_ONCE(1, "%s: collect md not set\n", t->dev->name); 
+} + +static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) +{ + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, NULL); +} + +static inline void init_tunnel_flow(struct flowi4 *fl4, + int proto, + __be32 daddr, __be32 saddr, + __be32 key, __u8 tos, int oif) +{ + memset(fl4, 0, sizeof(*fl4)); + fl4->flowi4_oif = oif; + fl4->daddr = daddr; + fl4->saddr = saddr; + fl4->flowi4_tos = tos; + fl4->flowi4_proto = proto; + fl4->fl4_gre_key = key; +} + +static int ip_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = ETH_DATA_LEN; + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + + iph = &tunnel->parms.iph; + + /* Guess output device to choose reasonable mtu and needed_headroom */ + if (iph->daddr) { + struct flowi4 fl4; + struct rtable *rt; + + init_tunnel_flow(&fl4, iph->protocol, iph->daddr, + iph->saddr, tunnel->parms.o_key, + RT_TOS(iph->tos), tunnel->parms.link); + rt = ip_route_output_key(tunnel->net, &fl4); + + if (!IS_ERR(rt)) { + tdev = rt_dst(rt).dev; + ip_rt_put(rt); + } + if (dev->type != ARPHRD_ETHER) + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len + tdev->needed_headroom; + mtu = tdev->mtu; + } + + dev->needed_headroom = t_hlen + hlen; + mtu -= (dev->hard_header_len + t_hlen); + + if (mtu < 68) + mtu = 68; + + return mtu; +} + +int rpl_ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen = tunnel->hlen + sizeof(struct iphdr); + + if (new_mtu < 68 || + new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ip_tunnel_dev_free(struct net_device *dev) +{ +#ifdef HAVE_DEV_TSTATS + free_percpu(dev->tstats); +#endif + 
free_netdev(dev); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) +void rpl_ip_tunnel_dellink(struct net_device *dev, struct list_head *head) +#else +void rpl_ip_tunnel_dellink(struct net_device *dev) +#endif +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_net *itn; + + itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); + + ip_tunnel_del(itn, netdev_priv(dev)); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + unregister_netdevice_queue(dev, head); +#endif +} + +int rpl_ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, + struct rtnl_link_ops *ops, char *devname) +{ + struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); + + itn->collect_md_tun = NULL; + itn->rtnl_ops = ops; + return 0; +} + +static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, + struct rtnl_link_ops *ops) +{ + struct ip_tunnel *t; + + t = rtnl_dereference(itn->collect_md_tun); + if (!t) + return; + unregister_netdevice_queue(t->dev, head); +} + +void rpl_ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +{ + LIST_HEAD(list); + + rtnl_lock(); + ip_tunnel_destroy(itn, &list, ops); + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +int rpl_ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], + struct ip_tunnel_parm *p) +{ + struct ip_tunnel *nt; + struct net *net = dev_net(dev); + struct ip_tunnel_net *itn; + int mtu; + int err; + + nt = netdev_priv(dev); + itn = net_generic(net, nt->ip_tnl_net_id); + + if (nt->collect_md) { + if (rtnl_dereference(itn->collect_md_tun)) + return -EEXIST; + } else { + return -EOPNOTSUPP; + } + + nt->net = net; + nt->parms = *p; + err = register_netdevice(dev); + if (err) + goto out; + + if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) + eth_hw_addr_random(dev); + + mtu = ip_tunnel_bind_dev(dev); + if (!tb[IFLA_MTU]) + dev->mtu = mtu; + + ip_tunnel_add(itn, nt); +out: + return err; +} + +int rpl_ip_tunnel_init(struct net_device *dev) +{ + struct 
ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + + dev->destructor = ip_tunnel_dev_free; +#ifdef HAVE_DEV_TSTATS + dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; +#endif + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + iph->version = 4; + iph->ihl = 5; + + if (tunnel->collect_md) + dev->features |= NETIF_F_NETNS_LOCAL; + + return 0; +} + +void rpl_ip_tunnel_uninit(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; + struct ip_tunnel_net *itn; + + itn = net_generic(net, tunnel->ip_tnl_net_id); + ip_tunnel_del(itn, netdev_priv(dev)); +} + +/* Do least required initialization, rest of init is done in tunnel_init call */ +void rpl_ip_tunnel_setup(struct net_device *dev, int net_id) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + tunnel->ip_tnl_net_id = net_id; +} +#endif diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c index 8ff7cd79f..179fa47b2 100644 --- a/datapath/linux/compat/ip_tunnels_core.c +++ b/datapath/linux/compat/ip_tunnels_core.c @@ -34,6 +34,7 @@ #include "compat.h" #include "gso.h" +#include "vport-netdev.h" #if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, @@ -44,11 +45,11 @@ int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, struct iphdr *iph; int err; - nf_reset(skb); - secpath_reset(skb); + skb_scrub_packet(skb, xnet); + skb_clear_hash(skb); - skb_dst_drop(skb); skb_dst_set(skb, &rt_dst(rt)); + #if 0 /* Do not clear ovs_skb_cb. It will be done in gso code. 
*/ memset(IPCB(skb), 0, sizeof(*IPCB(skb))); @@ -71,6 +72,9 @@ int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, #ifdef HAVE_IP_SELECT_IDENT_USING_DST_ENTRY __ip_select_ident(iph, &rt_dst(rt), (skb_shinfo(skb)->gso_segs ?: 1) - 1); +#elif defined(HAVE_IP_SELECT_IDENT_USING_NET) + __ip_select_ident(dev_net(rt->dst.dev), iph, + skb_shinfo(skb)->gso_segs ?: 1); #else __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); #endif @@ -84,7 +88,7 @@ EXPORT_SYMBOL_GPL(rpl_iptunnel_xmit); struct sk_buff *ovs_iptunnel_handle_offloads(struct sk_buff *skb, bool csum_help, int gso_type_mask, - void (*fix_segment)(struct sk_buff *)) + void (*fix_segment)(struct sk_buff *)) { int err; @@ -180,3 +184,84 @@ bool ovs_skb_is_encapsulated(struct sk_buff *skb) return ovs_skb_get_inner_protocol(skb) || skb_encapsulation(skb); } EXPORT_SYMBOL_GPL(ovs_skb_is_encapsulated); + +/* derived from ip_tunnel_rcv(). */ +void ovs_ip_tunnel_rcv(struct net_device *dev, struct sk_buff *skb, + struct metadata_dst *tun_dst) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + struct pcpu_sw_netstats *tstats; + + tstats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); +#endif + + skb_reset_mac_header(skb); + skb_scrub_packet(skb, false); + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + ovs_skb_dst_set(skb, (struct dst_entry *)tun_dst); + +#ifndef HAVE_METADATA_DST + netdev_port_receive(skb, &tun_dst->u.tun_info); +#else + netif_rx(skb); +#endif +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) +#ifndef HAVE_PCPU_SW_NETSTATS +#define netdev_stats_to_stats64 rpl_netdev_stats_to_stats64 +static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, + const struct net_device_stats *netdev_stats) +{ +#if BITS_PER_LONG == 64 + BUILD_BUG_ON(sizeof(*stats64) != 
sizeof(*netdev_stats)); + memcpy(stats64, netdev_stats, sizeof(*stats64)); +#else + size_t i, n = sizeof(*stats64) / sizeof(u64); + const unsigned long *src = (const unsigned long *)netdev_stats; + u64 *dst = (u64 *)stats64; + + BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != + sizeof(*stats64) / sizeof(u64)); + for (i = 0; i < n; i++) + dst[i] = src[i]; +#endif +} + +struct rtnl_link_stats64 *rpl_ip_tunnel_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) +{ + int i; + + netdev_stats_to_stats64(tot, &dev->stats); + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr((struct pcpu_sw_netstats __percpu *)dev->tstats, i); + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + unsigned int start; + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + rx_packets = tstats->rx_packets; + tx_packets = tstats->tx_packets; + rx_bytes = tstats->rx_bytes; + tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + tot->rx_packets += rx_packets; + tot->tx_packets += tx_packets; + tot->rx_bytes += rx_bytes; + tot->tx_bytes += tx_bytes; + } + + return tot; +} +#endif +#endif diff --git a/datapath/linux/compat/lisp.c b/datapath/linux/compat/lisp.c new file mode 100644 index 000000000..e5a6a7fe0 --- /dev/null +++ b/datapath/linux/compat/lisp.c @@ -0,0 +1,711 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * Copyright (c) 2013 Cisco Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/version.h> + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/net.h> +#include <linux/module.h> +#include <linux/rculist.h> +#include <linux/udp.h> + +#include <net/icmp.h> +#include <net/ip.h> +#include <net/lisp.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/route.h> +#include <net/udp.h> +#include <net/udp_tunnel.h> +#include <net/xfrm.h> + +#include "datapath.h" +#include "gso.h" +#include "vport.h" +#include "gso.h" +#include "vport-netdev.h" + +#define LISP_UDP_PORT 4341 +#define LISP_NETDEV_VER "0.1" +static int lisp_net_id; + +/* Pseudo network device */ +struct lisp_dev { + struct net *net; /* netns for packet i/o */ + struct net_device *dev; /* netdev for lisp tunnel */ + struct socket *sock; + __be16 dst_port; + struct list_head next; +}; + +/* per-network namespace private data for this module */ +struct lisp_net { + struct list_head lisp_list; +}; + +/* + * LISP encapsulation header: + * + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |N|L|E|V|I|flags| Nonce/Map-Version | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Instance ID/Locator Status Bits | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + */ + +/** + * struct lisphdr - LISP header + * @nonce_present: Flag indicating the presence of a 24 bit nonce value. + * @locator_status_bits_present: Flag indicating the presence of Locator Status + * Bits (LSB). + * @solicit_echo_nonce: Flag indicating the use of the echo noncing mechanism. + * @map_version_present: Flag indicating the use of mapping versioning. + * @instance_id_present: Flag indicating the presence of a 24 bit Instance ID. + * @reserved_flags: 3 bits reserved for future flags. + * @nonce: 24 bit nonce value. + * @map_version: 24 bit mapping version. 
+ * @locator_status_bits: Locator Status Bits: 32 bits when instance_id_present + * is not set, 8 bits when it is. + * @instance_id: 24 bit Instance ID + */ +struct lisphdr { +#ifdef __LITTLE_ENDIAN_BITFIELD + __u8 reserved_flags:3; + __u8 instance_id_present:1; + __u8 map_version_present:1; + __u8 solicit_echo_nonce:1; + __u8 locator_status_bits_present:1; + __u8 nonce_present:1; +#else + __u8 nonce_present:1; + __u8 locator_status_bits_present:1; + __u8 solicit_echo_nonce:1; + __u8 map_version_present:1; + __u8 instance_id_present:1; + __u8 reserved_flags:3; +#endif + union { + __u8 nonce[3]; + __u8 map_version[3]; + } u1; + union { + __be32 locator_status_bits; + struct { + __u8 instance_id[3]; + __u8 locator_status_bits; + } word2; + } u2; +}; + +#define LISP_HLEN (sizeof(struct udphdr) + sizeof(struct lisphdr)) + +static inline struct lisphdr *lisp_hdr(const struct sk_buff *skb) +{ + return (struct lisphdr *)(udp_hdr(skb) + 1); +} + +/* Convert 64 bit tunnel ID to 24 bit Instance ID. + * + * The three bytes written to iid[] are the three least-significant bytes of + * tun_id in network byte order; both endian branches cast the __be64 to a + * plain u64 before shifting so the arithmetic is done on an ordinary integer. + */ +static void tunnel_id_to_instance_id(__be64 tun_id, __u8 *iid) +{ + +#ifdef __BIG_ENDIAN + iid[0] = (__force __u8)((__force u64)tun_id >> 16); + iid[1] = (__force __u8)((__force u64)tun_id >> 8); + iid[2] = (__force __u8)(__force u64)tun_id; +#else + iid[0] = (__force __u8)((__force u64)tun_id >> 40); + iid[1] = (__force __u8)((__force u64)tun_id >> 48); + iid[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + +/* Convert 24 bit Instance ID to 64 bit tunnel ID. + * + * Inverse of tunnel_id_to_instance_id(): iid[0..2] become the three + * least-significant bytes of the network-order result. The value is built + * as a u64 and force-cast to __be64 in both endian branches. + */ +static __be64 instance_id_to_tunnel_id(__u8 *iid) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)(((__force u64)iid[0] << 16) | + ((__force u64)iid[1] << 8) | + (__force u64)iid[2]); +#else + return (__force __be64)(((__force u64)iid[0] << 40) | + ((__force u64)iid[1] << 48) | + ((__force u64)iid[2] << 56)); +#endif +} + +/* Compute source UDP port for outgoing packet. + * Currently we use the flow hash. 
+ */
+static u16 get_src_port(struct net *net, struct sk_buff *skb)
+{
+	u32 flow_hash = skb_get_hash(skb);
+	int port_min, port_max;
+	unsigned int span;
+	u64 scaled;
+
+	/* A zero hash is treated as "no hash": fall back to hashing the
+	 * address words of the packet's network header.
+	 */
+	if (flow_hash == 0) {
+		switch (skb->protocol) {
+		case htons(ETH_P_IP): {
+			const struct iphdr *ip4;
+
+			ip4 = (struct iphdr *) skb_network_header(skb);
+			/* Two address-sized words starting at saddr. */
+			flow_hash = jhash2((const u32 *)&ip4->saddr,
+					   (sizeof(ip4->saddr) * 2) / sizeof(u32),
+					   0);
+			break;
+		}
+		case htons(ETH_P_IPV6): {
+			const struct ipv6hdr *ip6;
+
+			ip6 = (struct ipv6hdr *) skb_network_header(skb);
+			/* Two in6_addr-sized blocks starting at saddr. */
+			flow_hash = jhash2((const u32 *)&ip6->saddr,
+					   (sizeof(struct in6_addr) * 2) / sizeof(u32),
+					   0);
+			break;
+		}
+		default:
+			pr_warn_once("LISP inner protocol is not IP when "
+				     "calculating hash.\n");
+			break;
+		}
+	}
+
+	/* Scale the 32-bit hash onto the local port range [min, max]. */
+	inet_get_local_port_range(net, &port_min, &port_max);
+	span = (port_max - port_min) + 1;
+	scaled = (u64) flow_hash * span;
+	return (scaled >> 32) + port_min;
+}
+
+/* Prepend a LISP header to @skb.
+ *
+ * Only the L (locator status bits) and I (instance ID) flags are set; the
+ * nonce/map-version bytes are zeroed, the instance ID is taken from
+ * @tun_key->tun_id and the 8-bit locator status field is set to 1.
+ * Uses __skb_push(), so the caller must already have made room for
+ * sizeof(struct lisphdr) bytes of headroom.
+ */
+static void lisp_build_header(struct sk_buff *skb,
+			      const struct ip_tunnel_key *tun_key)
+{
+	struct lisphdr *hdr;
+
+	hdr = (struct lisphdr *)__skb_push(skb, sizeof(*hdr));
+
+	/* Flag byte: every bit is assigned explicitly. */
+	hdr->reserved_flags = 0;		/* Reserved flags, set to 0 */
+	hdr->instance_id_present = 1;		/* Store the tun_id as Instance ID */
+	hdr->map_version_present = 0;		/* No mapping versioning */
+	hdr->solicit_echo_nonce = 0;		/* No echo noncing */
+	hdr->locator_status_bits_present = 1;	/* Set LSB */
+	hdr->nonce_present = 0;			/* Echo nonce algorithm unsupported */
+
+	/* 24-bit nonce/map-version field is unused: clear it. */
+	memset(hdr->u1.nonce, 0, sizeof(hdr->u1.nonce));
+
+	hdr->u2.word2.locator_status_bits = 1;
+	tunnel_id_to_instance_id(tun_key->tun_id, hdr->u2.word2.instance_id);
+}
+
+/* Called with rcu_read_lock and BH disabled.
*/
+static int lisp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct net_device *dev;
+	struct lisphdr *lisph;
+	struct iphdr *inner_iph;
+	struct metadata_dst *tun_dst;
+#ifndef HAVE_METADATA_DST
+	struct metadata_dst temp;
+#endif
+	__be64 key;
+	struct ethhdr *ethh;
+	__be16 protocol;
+
+	/* UDP encap_rcv handler: strips the UDP + LISP headers, prepends a
+	 * synthetic Ethernet header and hands the frame to
+	 * ovs_ip_tunnel_rcv().  The skb is always consumed and 0 returned.
+	 */
+	dev = rcu_dereference_sk_user_data(sk);
+	if (unlikely(!dev))
+		goto error;
+
+	if (iptunnel_pull_header(skb, LISP_HLEN, 0))
+		goto error;
+
+	lisph = lisp_hdr(skb);
+
+	if (lisph->instance_id_present != 1)
+		key = 0;
+	else
+		key = instance_id_to_tunnel_id(&lisph->u2.word2.instance_id[0]);
+
+	/* Drop non-IP inner packets.  This is decided before the tunnel
+	 * metadata is set up so that the drop path never has a metadata dst
+	 * to dispose of.
+	 */
+	inner_iph = (struct iphdr *)(lisph + 1);
+	switch (inner_iph->version) {
+	case 4:
+		protocol = htons(ETH_P_IP);
+		break;
+	case 6:
+		protocol = htons(ETH_P_IPV6);
+		break;
+	default:
+		goto error;
+	}
+
+	/* Save outer tunnel values */
+#ifndef HAVE_METADATA_DST
+	tun_dst = &temp;
+	ovs_udp_tun_rx_dst(&tun_dst->u.tun_info, skb, AF_INET, TUNNEL_KEY, key, 0);
+#else
+	tun_dst = udp_tun_rx_dst(skb, AF_INET, TUNNEL_KEY, key, 0);
+	/* NULL means the metadata dst could not be obtained: drop. */
+	if (unlikely(!tun_dst))
+		goto error;
+#endif
+	skb->protocol = protocol;
+
+	/* Add Ethernet header: all zero except a 0x02 first octet in both
+	 * addresses and the inner protocol as ethertype.
+	 */
+	ethh = (struct ethhdr *)skb_push(skb, ETH_HLEN);
+	memset(ethh, 0, ETH_HLEN);
+	ethh->h_dest[0] = 0x02;
+	ethh->h_source[0] = 0x02;
+	ethh->h_proto = protocol;
+
+	ovs_ip_tunnel_rcv(dev, skb, tun_dst);
+	goto out;
+
+error:
+	kfree_skb(skb);
+out:
+	return 0;
+}
+
+/* Encapsulate @skb in UDP/LISP and transmit it using the tunnel metadata
+ * attached to the skb.  Only IPv4 and IPv6 payloads are carried; anything
+ * else, and any failure, drops the packet and is counted in tx_errors.
+ * Always returns NETDEV_TX_OK: the skb is consumed on every path.
+ */
+netdev_tx_t rpl_lisp_xmit(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct lisp_dev *lisp_dev = netdev_priv(dev);
+	struct net *net = lisp_dev->net;
+	int network_offset = skb_network_offset(skb);
+	struct ip_tunnel_info *info;
+	struct ip_tunnel_key *tun_key;
+	struct rtable *rt;
+	int min_headroom;
+	__be16 src_port, dst_port;
+	struct flowi4 fl;
+	__be16 df;
+	int err;
+
+	info = skb_tunnel_info(skb);
+	if (unlikely(!info)) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	if (skb->protocol != htons(ETH_P_IP) &&
+	    skb->protocol != htons(ETH_P_IPV6)) {
+		err = 0;
+		goto error;
+	}
+
+	tun_key = &info->key;
+
+	/* Route lookup */
+	memset(&fl, 0, sizeof(fl));
+	fl.daddr = tun_key->u.ipv4.dst;
+	fl.saddr = tun_key->u.ipv4.src;
+	fl.flowi4_tos = RT_TOS(tun_key->tos);
+	fl.flowi4_mark = skb->mark;
+	fl.flowi4_proto = IPPROTO_UDP;
+	rt = ip_route_output_key(net, &fl);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		goto error;
+	}
+
+	/* Room needed in front of the payload: link layer, any dst header,
+	 * outer IP, UDP and LISP.
+	 */
+	min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+		+ sizeof(struct iphdr) + LISP_HLEN;
+
+	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+		int head_delta = SKB_DATA_ALIGN(min_headroom -
+						skb_headroom(skb) +
+						16);
+
+		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+				       0, GFP_ATOMIC);
+		if (unlikely(err))
+			goto err_free_rt;
+	}
+
+	/* Reset l2 headers. */
+	skb_pull(skb, network_offset);
+	skb_reset_mac_header(skb);
+	vlan_set_tci(skb, 0);
+
+	skb = udp_tunnel_handle_offloads(skb, false, 0, false);
+	if (IS_ERR(skb)) {
+		err = PTR_ERR(skb);
+		/* Nothing left to free below; kfree_skb(NULL) is a no-op. */
+		skb = NULL;
+		goto err_free_rt;
+	}
+
+	src_port = htons(get_src_port(net, skb));
+	dst_port = lisp_dev->dst_port;
+
+	lisp_build_header(skb, tun_key);
+
+	skb->ignore_df = 1;
+
+	ovs_skb_set_inner_protocol(skb, skb->protocol);
+
+	df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+	err = udp_tunnel_xmit_skb(rt, lisp_dev->sock->sk, skb,
+				  fl.saddr, tun_key->u.ipv4.dst,
+				  tun_key->tos, tun_key->ttl,
+				  df, src_port, dst_port, false, true);
+
+	iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
+	return NETDEV_TX_OK;
+
+err_free_rt:
+	ip_rt_put(rt);
+error:
+	kfree_skb(skb);
+	/* Account the drop, as the STT transmit path does. */
+	dev->stats.tx_errors++;
+	return NETDEV_TX_OK;
+}
+EXPORT_SYMBOL(rpl_lisp_xmit);
+
+#ifdef HAVE_DEV_TSTATS
+/* Setup stats when device is created */
+static int lisp_init(struct net_device *dev)
+{
+	dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Release the per-cpu stats allocated by lisp_init(). */
+static void lisp_uninit(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+}
+#endif
+
+/* Create a UDP socket in @net bound to local port @port (network byte
+ * order): AF_INET6 when @ipv6 is set, otherwise AF_INET on INADDR_ANY.
+ * Returns the socket or an ERR_PTR().
+ */
+static struct socket *create_sock(struct net *net, bool ipv6,
+				  __be16 port)
+{
+	struct socket *sock;
+	struct udp_port_cfg udp_conf;
+	int err;
+
+	memset(&udp_conf, 0, sizeof(udp_conf));
+
+	if (ipv6) {
+		udp_conf.family = AF_INET6;
+	} else {
+		udp_conf.family = AF_INET;
+		udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+	}
+
+	udp_conf.local_udp_port = port;
+
+	/* Open UDP socket */
+	err = udp_sock_create(net, &udp_conf, &sock);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	return sock;
+}
+
+/* ndo_open: create the IPv4 UDP socket for this device's port and register
+ * lisp_rcv() as its encapsulation receive handler.
+ */
+static int lisp_open(struct net_device *dev)
+{
+	struct lisp_dev *lisp = netdev_priv(dev);
+	struct udp_tunnel_sock_cfg tunnel_cfg;
+	struct net *net = lisp->net;
+	struct socket *sock;
+
+	/* Only publish the socket in lisp->sock once it is known to be
+	 * valid, so the field never holds an ERR_PTR().
+	 */
+	sock = create_sock(net, false, lisp->dst_port);
+	if (IS_ERR(sock))
+		return PTR_ERR(sock);
+	lisp->sock = sock;
+
+	/* Mark socket as an encapsulation socket.  Zero the config first so
+	 * that any member not set explicitly below is NULL/0 rather than
+	 * stack garbage.
+	 */
+	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
+	tunnel_cfg.sk_user_data = dev;
+	tunnel_cfg.encap_type = 1;
+	tunnel_cfg.encap_rcv = lisp_rcv;
+	tunnel_cfg.encap_destroy = NULL;
+	setup_udp_tunnel_sock(net, lisp->sock, &tunnel_cfg);
+	return 0;
+}
+
+/* ndo_stop: release the tunnel socket created by lisp_open(). */
+static int lisp_stop(struct net_device *dev)
+{
+	struct lisp_dev *lisp = netdev_priv(dev);
+
+	udp_tunnel_sock_release(lisp->sock);
+	lisp->sock = NULL;
+	return 0;
+}
+
+/* ndo_start_xmit: transmit through rpl_lisp_xmit() when metadata dsts are
+ * available, otherwise drop and count the packet.
+ */
+static netdev_tx_t lisp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+#ifdef HAVE_METADATA_DST
+	return rpl_lisp_xmit(skb);
+#else
+	/* Drop All packets coming from networking stack. OVS-CB is
+	 * not initialized for these packets.
+	 */
+
+	dev_kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return NETDEV_TX_OK;
+#endif
+}
+
+static const struct net_device_ops lisp_netdev_ops = {
+#ifdef HAVE_DEV_TSTATS
+	.ndo_init               = lisp_init,
+	.ndo_uninit             = lisp_uninit,
+	.ndo_get_stats64        = ip_tunnel_get_stats64,
+#endif
+	.ndo_open               = lisp_open,
+	.ndo_stop               = lisp_stop,
+	.ndo_start_xmit         = lisp_dev_xmit,
+	.ndo_change_mtu         = eth_change_mtu,
+	.ndo_validate_addr      = eth_validate_addr,
+	.ndo_set_mac_address    = eth_mac_addr,
+};
+
+/* ethtool get_drvinfo: report the driver name and version strings. */
+static void lisp_get_drvinfo(struct net_device *dev,
+			     struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->version, LISP_NETDEV_VER, sizeof(drvinfo->version));
+	strlcpy(drvinfo->driver, "lisp", sizeof(drvinfo->driver));
+}
+
+static const struct ethtool_ops lisp_ethtool_ops = {
+	.get_drvinfo    = lisp_get_drvinfo,
+	.get_link       = ethtool_op_get_link,
+};
+
+/* Info for udev, that this is a virtual tunnel endpoint */
+static struct device_type lisp_type = {
+	.name = "lisp",
+};
+
+/* Initialize the device structure.
*/
+static void lisp_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &lisp_netdev_ops;
+	dev->ethtool_ops = &lisp_ethtool_ops;
+	dev->destructor = free_netdev;
+
+	SET_NETDEV_DEVTYPE(dev, &lisp_type);
+
+	dev->features    |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL;
+	dev->features    |= NETIF_F_SG | NETIF_F_HW_CSUM;
+	dev->features    |= NETIF_F_RXCSUM;
+	dev->features    |= NETIF_F_GSO_SOFTWARE;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+#endif
+#ifdef HAVE_METADATA_DST
+	netif_keep_dst(dev);
+#endif
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
+	eth_hw_addr_random(dev);
+}
+
+/* Netlink attribute policy: the only LISP attribute is a 16-bit port. */
+static const struct nla_policy lisp_policy[IFLA_LISP_MAX + 1] = {
+	[IFLA_LISP_PORT]              = { .type = NLA_U16 },
+};
+
+/* rtnl validate hook: if a link-layer address is supplied it must be
+ * ETH_ALEN bytes long and a valid Ethernet address.
+ */
+static int lisp_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+			return -EINVAL;
+
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+			return -EADDRNOTAVAIL;
+	}
+
+	return 0;
+}
+
+/* Look up the LISP device registered for @dst_port in @net's per-netns
+ * list.  Returns NULL when there is none.
+ */
+static struct lisp_dev *find_dev(struct net *net, __be16 dst_port)
+{
+	struct lisp_net *ln = net_generic(net, lisp_net_id);
+	struct lisp_dev *dev;
+
+	list_for_each_entry(dev, &ln->lisp_list, next) {
+		if (dev->dst_port == dst_port)
+			return dev;
+	}
+	return NULL;
+}
+
+/* Fill in the private data of @dev, register it and add it to the
+ * per-netns list.  Returns -EBUSY if @dst_port is already used by another
+ * LISP device in @net, or the register_netdevice() error.
+ */
+static int lisp_configure(struct net *net, struct net_device *dev,
+			  __be16 dst_port)
+{
+	struct lisp_net *ln = net_generic(net, lisp_net_id);
+	struct lisp_dev *lisp = netdev_priv(dev);
+	int err;
+
+	lisp->net = net;
+	lisp->dev = dev;
+
+	lisp->dst_port = dst_port;
+
+	if (find_dev(net, dst_port))
+		return -EBUSY;
+
+	err = register_netdevice(dev);
+	if (err)
+		return err;
+
+	list_add(&lisp->next, &ln->lisp_list);
+	return 0;
+}
+
+/* rtnl newlink hook.  The port defaults to LISP_UDP_PORT and may be
+ * overridden with IFLA_LISP_PORT.  Kernels before 2.6.39 do not pass a
+ * netns, so init_net is used there.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+static int lisp_newlink(struct net *net, struct net_device *dev,
+		struct nlattr *tb[], struct nlattr *data[])
+{
+#else
+static int lisp_newlink(struct net_device *dev,
+		struct nlattr *tb[], struct nlattr *data[])
+
+{
+	struct net *net = &init_net;
+#endif
+	__be16 dst_port = htons(LISP_UDP_PORT);
+
+	if (data[IFLA_LISP_PORT])
+		dst_port = nla_get_be16(data[IFLA_LISP_PORT]);
+
+	return lisp_configure(net, dev, dst_port);
+}
+
+/* rtnl dellink hook: drop the device from the per-netns list and queue it
+ * for unregistration.  The pre-2.6.39 prototype has no @head argument, so
+ * that variant supplies a NULL head of its own (no batching list).
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+static void lisp_dellink(struct net_device *dev, struct list_head *head)
+{
+#else
+static void lisp_dellink(struct net_device *dev)
+{
+	struct list_head *head = NULL;
+#endif
+	struct lisp_dev *lisp = netdev_priv(dev);
+
+	list_del(&lisp->next);
+	unregister_netdevice_queue(dev, head);
+}
+
+/* Netlink space needed by lisp_fill_info(): one 16-bit port attribute. */
+static size_t lisp_get_size(const struct net_device *dev)
+{
+	return nla_total_size(sizeof(__be16));  /* IFLA_LISP_PORT */
+}
+
+/* Dump the device's configuration (its port) into @skb. */
+static int lisp_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct lisp_dev *lisp = netdev_priv(dev);
+
+	if (nla_put_be16(skb, IFLA_LISP_PORT, lisp->dst_port))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops lisp_link_ops __read_mostly = {
+	.kind           = "lisp",
+	.maxtype        = IFLA_LISP_MAX,
+	.policy         = lisp_policy,
+	.priv_size      = sizeof(struct lisp_dev),
+	.setup          = lisp_setup,
+	.validate       = lisp_validate,
+	.newlink        = lisp_newlink,
+	.dellink        = lisp_dellink,
+	.get_size       = lisp_get_size,
+	.fill_info      = lisp_fill_info,
+};
+
+/* Create and register a LISP device named @name in @net without going
+ * through netlink.  @dst_port is in host byte order.  Returns the device
+ * or an ERR_PTR(); on configuration failure the device is freed here.
+ */
+struct net_device *rpl_lisp_dev_create_fb(struct net *net, const char *name,
+				      u8 name_assign_type, u16 dst_port)
+{
+	struct nlattr *tb[IFLA_MAX + 1];
+	struct net_device *dev;
+	int err;
+
+	memset(tb, 0, sizeof(tb));
+	dev = rtnl_create_link(net, (char *) name, name_assign_type,
+			       &lisp_link_ops, tb);
+	if (IS_ERR(dev))
+		return dev;
+
+	err = lisp_configure(net, dev, htons(dst_port));
+	if (err) {
+		free_netdev(dev);
+		return ERR_PTR(err);
+	}
+	return dev;
+}
+EXPORT_SYMBOL_GPL(rpl_lisp_dev_create_fb);
+
+static int lisp_init_net(struct net *net)
+{
+	struct lisp_net *ln = net_generic(net, lisp_net_id);
+
+
INIT_LIST_HEAD(&ln->lisp_list); + return 0; +} + +static void lisp_exit_net(struct net *net) +{ + struct lisp_net *ln = net_generic(net, lisp_net_id); + struct lisp_dev *lisp, *next; + struct net_device *dev, *aux; + LIST_HEAD(list); + + rtnl_lock(); + + /* gather any lisp devices that were moved into this ns */ + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &lisp_link_ops) + unregister_netdevice_queue(dev, &list); + + list_for_each_entry_safe(lisp, next, &ln->lisp_list, next) { + /* If lisp->dev is in the same netns, it was already added + * to the lisp by the previous loop. + */ + if (!net_eq(dev_net(lisp->dev), net)) + unregister_netdevice_queue(lisp->dev, &list); + } + + /* unregister the devices gathered above */ + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations lisp_net_ops = { + .init = lisp_init_net, + .exit = lisp_exit_net, + .id = &lisp_net_id, + .size = sizeof(struct lisp_net), +}; + +DEFINE_COMPAT_PNET_REG_FUNC(device) +int rpl_lisp_init_module(void) +{ + int rc; + + rc = register_pernet_subsys(&lisp_net_ops); + if (rc) + goto out1; + + rc = rtnl_link_register(&lisp_link_ops); + if (rc) + goto out2; + + pr_info("LISP tunneling driver\n"); + return 0; +out2: + unregister_pernet_subsys(&lisp_net_ops); +out1: + return rc; +} + +void rpl_lisp_cleanup_module(void) +{ + rtnl_link_unregister(&lisp_link_ops); + unregister_pernet_subsys(&lisp_net_ops); +} diff --git a/datapath/linux/compat/netdevice.c b/datapath/linux/compat/netdevice.c index 483d665d8..e28b878ee 100644 --- a/datapath/linux/compat/netdevice.c +++ b/datapath/linux/compat/netdevice.c @@ -117,3 +117,122 @@ struct sk_buff *rpl__skb_gso_segment(struct sk_buff *skb, EXPORT_SYMBOL_GPL(rpl__skb_gso_segment); #endif /* OVS_USE_COMPAT_GSO_SEGMENTATION */ + +#ifdef HAVE_UDP_OFFLOAD +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) +struct sk_buff **rpl_eth_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff *p, **pp = NULL; 
+ struct ethhdr *eh, *eh2; + unsigned int hlen, off_eth; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_eth = skb_gro_offset(skb); + hlen = off_eth + sizeof(*eh); + eh = skb_gro_header_fast(skb, off_eth); + if (skb_gro_header_hard(skb, hlen)) { + eh = skb_gro_header_slow(skb, hlen, off_eth); + if (unlikely(!eh)) + goto out; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + eh2 = (struct ethhdr *)(p->data + off_eth); + if (compare_ether_header(eh, eh2)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + type = eh->h_proto; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (ptype == NULL) { + flush = 1; + goto out_unlock; + } + + skb_gro_pull(skb, sizeof(*eh)); + skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +int rpl_eth_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff); + __be16 type = eh->h_proto; + struct packet_offload *ptype; + int err = -ENOSYS; + + if (skb->encapsulation) + skb_set_inner_mac_header(skb, nhoff); + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype != NULL) + err = ptype->callbacks.gro_complete(skb, nhoff + + sizeof(struct ethhdr)); + + rcu_read_unlock(); + return err; +} + +#endif +#endif /* HAVE_UDP_OFFLOAD */ + +#ifndef HAVE_RTNL_LINK_STATS64 +#undef dev_get_stats +struct rtnl_link_stats64 *rpl_dev_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *storage) +{ + const struct net_device_stats *stats = dev_get_stats(dev); + +#define copy(s) storage->s = stats->s + + copy(rx_packets); + copy(tx_packets); + copy(rx_bytes); + copy(tx_bytes); + copy(rx_errors); + copy(tx_errors); + copy(rx_dropped); + copy(tx_dropped); + copy(multicast); + copy(collisions); + + copy(rx_length_errors); + 
copy(rx_over_errors); + copy(rx_crc_errors); + copy(rx_frame_errors); + copy(rx_fifo_errors); + copy(rx_missed_errors); + + copy(tx_aborted_errors); + copy(tx_carrier_errors); + copy(tx_fifo_errors); + copy(tx_heartbeat_errors); + copy(tx_window_errors); + + copy(rx_compressed); + copy(tx_compressed); + +#undef copy + return storage; +} +#endif diff --git a/datapath/linux/compat/skbuff-openvswitch.c b/datapath/linux/compat/skbuff-openvswitch.c index fad1cc7d0..c46798df6 100644 --- a/datapath/linux/compat/skbuff-openvswitch.c +++ b/datapath/linux/compat/skbuff-openvswitch.c @@ -2,6 +2,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> +#include <linux/kconfig.h> #include "gso.h" @@ -280,3 +281,34 @@ void rpl_kfree_skb_list(struct sk_buff *segs) } EXPORT_SYMBOL(rpl_kfree_skb_list); #endif + +#ifndef HAVE_SKB_SCRUB_PACKET_XNET + +#define nf_reset_trace rpl_nf_reset_trace +static void nf_reset_trace(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) + skb->nf_trace = 0; +#endif +} + +void rpl_skb_scrub_packet(struct sk_buff *skb, bool xnet) +{ + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + skb->skb_iif = 0; +#endif + skb->ignore_df = 0; + skb_dst_drop(skb); + secpath_reset(skb); + nf_reset(skb); + nf_reset_trace(skb); + + if (!xnet) + return; + + skb_orphan(skb); + skb->mark = 0; +} +#endif diff --git a/datapath/linux/compat/stt.c b/datapath/linux/compat/stt.c index 0659c0b63..107aa2bb2 100644 --- a/datapath/linux/compat/stt.c +++ b/datapath/linux/compat/stt.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/unaligned.h> #include <linux/delay.h> @@ -28,9 +29,11 @@ #include <linux/tcp.h> #include <linux/workqueue.h> +#include <net/dst_metadata.h> #include <net/icmp.h> #include <net/inet_ecn.h> #include <net/ip.h> +#include <net/ip_tunnels.h> #include <net/ip6_checksum.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -40,10 +43,29 @@ #include <net/udp.h> #include "gso.h" +#include "compat.h" + +#define STT_NETDEV_VER "0.1" +#define STT_DST_PORT 7471 #ifdef OVS_STT #define STT_VER 0 +/* @list: Per-net list of STT ports. + * @rcv: The callback is called on STT packet recv, STT reassembly can generate + * multiple packets, in this case first packet has tunnel outer header, rest + * of the packets are inner packet segments with no stt header. + * @rcv_data: user data. + * @sock: Fake TCP socket for the STT port. + */ +struct stt_dev { + struct net_device *dev; + struct net *net; + struct list_head next; + struct socket *sock; + __be16 dst_port; +}; + #define STT_CSUM_VERIFIED BIT(0) #define STT_CSUM_PARTIAL BIT(1) #define STT_PROTO_IPV4 BIT(2) @@ -127,7 +149,8 @@ struct frag_skb_cb { /* per-network namespace private data for this module */ struct stt_net { - struct list_head sock_list; + struct list_head stt_list; + int n_tunnels; }; static int stt_net_id; @@ -144,14 +167,14 @@ static DEFINE_PER_CPU(u32, pkt_seq_counter); static void clean_percpu(struct work_struct *work); static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu); -static struct stt_sock *stt_find_sock(struct net *net, __be16 port) +static struct stt_dev *stt_find_sock(struct net *net, __be16 port) { struct stt_net *sn = net_generic(net, stt_net_id); - struct stt_sock *stt_sock; + struct stt_dev *stt_dev; - list_for_each_entry_rcu(stt_sock, &sn->sock_list, list) { - if (inet_sk(stt_sock->sock->sk)->inet_sport == port) - return stt_sock; + list_for_each_entry_rcu(stt_dev, &sn->stt_list, next) { + if 
(inet_sk(stt_dev->sock->sk)->inet_sport == port) + return stt_dev; } return NULL; } @@ -788,7 +811,6 @@ static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src, if (next) dst_clone(&rt->dst); - skb_clear_ovs_gso_cb(skb); skb->next = NULL; len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP, tos, ttl, df, false); @@ -835,7 +857,7 @@ static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto) return 0; } -int rpl_stt_xmit_skb(struct sk_buff *skb, struct rtable *rt, +static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, __be64 tun_id) @@ -906,7 +928,57 @@ err_free_rt: kfree_skb(skb); return ret; } -EXPORT_SYMBOL_GPL(rpl_stt_xmit_skb); + +netdev_tx_t ovs_stt_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct stt_dev *stt_dev = netdev_priv(dev); + struct net *net = stt_dev->net; + __be16 dport = inet_sk(stt_dev->sock->sk)->inet_sport; + struct ip_tunnel_key *tun_key; + struct ip_tunnel_info *tun_info; + struct rtable *rt; + struct flowi4 fl; + __be16 sport; + __be16 df; + int err; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &tun_info->key; + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = tun_key->u.ipv4.dst; + fl.saddr = tun_key->u.ipv4.src; + fl.flowi4_tos = RT_TOS(tun_key->tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_TCP; + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + skb->ignore_df = 1; + + err = stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst, + tun_key->tos, tun_key->ttl, + df, sport, dport, tun_key->tun_id); + iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats); + return NETDEV_TX_OK; +error: + kfree_skb(skb); + dev->stats.tx_errors++; + return NETDEV_TX_OK; +} +EXPORT_SYMBOL(ovs_stt_xmit); static void free_frag(struct stt_percpu *stt_percpu, struct pkt_frag *frag) @@ -1213,7 +1285,41 @@ static bool set_offloads(struct sk_buff *skb) return true; } -static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb) + +#ifndef HAVE_METADATA_DST +static int __rcv(struct stt_dev *stt_dev, struct sk_buff *skb) +{ + struct metadata_dst tun_dst; + + ovs_ip_tun_rx_dst(&tun_dst.u.tun_info, skb, TUNNEL_KEY | TUNNEL_CSUM, + get_unaligned(&stt_hdr(skb)->key), 0); + tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source; + tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest; + + ovs_ip_tunnel_rcv(stt_dev->dev, skb, &tun_dst); + return 0; +} +#else +static int __rcv(struct stt_dev *stt_dev, struct sk_buff *skb) +{ + struct metadata_dst *tun_dst; + __be16 flags; + __be64 tun_id; + + flags = TUNNEL_KEY | TUNNEL_CSUM; + tun_id = get_unaligned(&stt_hdr(skb)->key); + tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); + if (!tun_dst) + return -ENOMEM; + tun_dst->u.tun_info.key.tp_src = tcp_hdr(skb)->source; + tun_dst->u.tun_info.key.tp_dst = tcp_hdr(skb)->dest; + + ovs_ip_tunnel_rcv(stt_dev->dev, skb, tun_dst); + return 0; +} + +#endif +static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb) { int err; @@ -1242,11 +1348,14 @@ static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb) if (skb_shinfo(skb)->frag_list && try_to_segment(skb)) goto drop; - stt_sock->rcv(stt_sock, skb); + err = __rcv(stt_dev, skb); + if (err) + goto drop; return; drop: /* Consume bad packet */ kfree_skb_list(skb); + 
stt_dev->dev->stats.rx_errors++; } static void tcp_sock_release(struct socket *sock) @@ -1324,15 +1433,27 @@ static void clean_percpu(struct work_struct *work) #define FIRST_PARAM unsigned int hooknum #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) +#ifdef HAVE_NF_HOOK_STATE +#if RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0) +/* RHEL nfhook hacks. */ +#ifndef __GENKSYMS__ +#define LAST_PARAM const struct net_device *in, const struct net_device *out, \ + const struct nf_hook_state *state +#else +#define LAST_PARAM const struct net_device *in, const struct net_device *out, \ + int (*okfn)(struct sk_buff *) +#endif +#else #define LAST_PARAM const struct nf_hook_state *state +#endif #else -#define LAST_PARAM const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *) +#define LAST_PARAM const struct net_device *in, const struct net_device *out, \ + int (*okfn)(struct sk_buff *) #endif static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM) { - struct stt_sock *stt_sock; + struct stt_dev *stt_dev; int ip_hdr_len; if (ip_hdr(skb)->protocol != IPPROTO_TCP) @@ -1344,12 +1465,12 @@ static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM) skb_set_transport_header(skb, ip_hdr_len); - stt_sock = stt_find_sock(dev_net(skb->dev), tcp_hdr(skb)->dest); - if (!stt_sock) + stt_dev = stt_find_sock(dev_net(skb->dev), tcp_hdr(skb)->dest); + if (!stt_dev) return NF_ACCEPT; __skb_pull(skb, ip_hdr_len + sizeof(struct tcphdr)); - stt_rcv(stt_sock, skb); + stt_rcv(stt_dev, skb); return NF_STOLEN; } @@ -1361,8 +1482,9 @@ static struct nf_hook_ops nf_hook_ops __read_mostly = { .priority = INT_MAX, }; -static int stt_start(void) +static int stt_start(struct net *net) { + struct stt_net *sn = net_generic(net, stt_net_id); int err; int i; @@ -1401,12 +1523,25 @@ static int stt_start(void) if (err) goto free_percpu; } + schedule_clean_percpu(); + n_tunnels++; + + if (sn->n_tunnels) { + sn->n_tunnels++; + return 0; + } 
+#ifdef HAVE_NF_REGISTER_NET_HOOK + /* On kernel which support per net nf-hook, nf_register_hook() takes + * rtnl-lock, which results in dead lock in stt-dev-create. Therefore + * use this new API. + */ + err = nf_register_net_hook(net, &nf_hook_ops); +#else err = nf_register_hook(&nf_hook_ops); +#endif if (err) goto free_percpu; - - schedule_clean_percpu(); - n_tunnels++; + sn->n_tunnels++; return 0; free_percpu: @@ -1423,17 +1558,26 @@ error: return err; } -static void stt_cleanup(void) +static void stt_cleanup(struct net *net) { + struct stt_net *sn = net_generic(net, stt_net_id); int i; + sn->n_tunnels--; + if (sn->n_tunnels) + goto out; +#ifdef HAVE_NF_REGISTER_NET_HOOK + nf_unregister_net_hook(net, &nf_hook_ops); +#else + nf_unregister_hook(&nf_hook_ops); +#endif + +out: n_tunnels--; if (n_tunnels) return; cancel_delayed_work_sync(&clean_percpu_wq); - nf_unregister_hook(&nf_hook_ops); - for_each_possible_cpu(i) { struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); int j; @@ -1451,102 +1595,306 @@ static void stt_cleanup(void) free_percpu(stt_percpu_data); } -static struct stt_sock *stt_socket_create(struct net *net, __be16 port, - stt_rcv_t *rcv, void *data) +static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev) { - struct stt_net *sn = net_generic(net, stt_net_id); - struct stt_sock *stt_sock; - struct socket *sock; +#ifdef HAVE_METADATA_DST + return ovs_stt_xmit(skb); +#else + /* Drop All packets coming from networking stack. OVS-CB is + * not initialized for these packets. 
+ */ + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +#endif +} + +/* Setup stats when device is created */ +static int stt_init(struct net_device *dev) +{ + dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static void stt_uninit(struct net_device *dev) +{ + free_percpu(dev->tstats); +} + +static int stt_open(struct net_device *dev) +{ + struct stt_dev *stt = netdev_priv(dev); + struct net *net = stt->net; int err; - stt_sock = kzalloc(sizeof(*stt_sock), GFP_KERNEL); - if (!stt_sock) - return ERR_PTR(-ENOMEM); + err = stt_start(net); + if (err) + return err; - err = tcp_sock_create4(net, port, &sock); - if (err) { - kfree(stt_sock); - return ERR_PTR(err); - } + err = tcp_sock_create4(net, stt->dst_port, &stt->sock); + if (err) + return err; + return 0; +} - stt_sock->sock = sock; - stt_sock->rcv = rcv; - stt_sock->rcv_data = data; +static int stt_stop(struct net_device *dev) +{ + struct stt_dev *stt_dev = netdev_priv(dev); + struct net *net = stt_dev->net; - list_add_rcu(&stt_sock->list, &sn->sock_list); + tcp_sock_release(stt_dev->sock); + stt_dev->sock = NULL; + stt_cleanup(net); + return 0; +} - return stt_sock; +static const struct net_device_ops stt_netdev_ops = { + .ndo_init = stt_init, + .ndo_uninit = stt_uninit, + .ndo_open = stt_open, + .ndo_stop = stt_stop, + .ndo_start_xmit = stt_dev_xmit, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_change_mtu = eth_change_mtu, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_mac_address = eth_mac_addr, +}; + +static void stt_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + strlcpy(drvinfo->version, STT_NETDEV_VER, sizeof(drvinfo->version)); + strlcpy(drvinfo->driver, "stt", sizeof(drvinfo->driver)); } -static void __stt_sock_release(struct stt_sock *stt_sock) +static const struct ethtool_ops stt_ethtool_ops = { + .get_drvinfo = stt_get_drvinfo, + .get_link = 
ethtool_op_get_link, +}; + +/* Info for udev, that this is a virtual tunnel endpoint */ +static struct device_type stt_type = { + .name = "stt", +}; + +/* Initialize the device structure. */ +static void stt_setup(struct net_device *dev) { - list_del_rcu(&stt_sock->list); - tcp_sock_release(stt_sock->sock); - kfree_rcu(stt_sock, rcu); + ether_setup(dev); + + dev->netdev_ops = &stt_netdev_ops; + dev->ethtool_ops = &stt_ethtool_ops; + dev->destructor = free_netdev; + + SET_NETDEV_DEVTYPE(dev, &stt_type); + + dev->features |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; + dev->features |= NETIF_F_RXCSUM; + dev->features |= NETIF_F_GSO_SOFTWARE; + + dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + +#ifdef HAVE_METADATA_DST + netif_keep_dst(dev); +#endif + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + eth_hw_addr_random(dev); } -struct stt_sock *rpl_stt_sock_add(struct net *net, __be16 port, - stt_rcv_t *rcv, void *data) +static const struct nla_policy stt_policy[IFLA_STT_MAX + 1] = { + [IFLA_STT_PORT] = { .type = NLA_U16 }, +}; + +static int stt_validate(struct nlattr *tb[], struct nlattr *data[]) { - struct stt_sock *stt_sock; + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + return 0; +} + +static struct stt_dev *find_dev(struct net *net, __be16 dst_port) +{ + struct stt_net *sn = net_generic(net, stt_net_id); + struct stt_dev *dev; + + list_for_each_entry(dev, &sn->stt_list, next) { + if (dev->dst_port == dst_port) + return dev; + } + return NULL; +} + +static int stt_configure(struct net *net, struct net_device *dev, + __be16 dst_port) +{ + struct stt_net *sn = net_generic(net, stt_net_id); + struct stt_dev *stt = netdev_priv(dev); int err; - err = stt_start(); + stt->net = net; + stt->dev = dev; + + stt->dst_port = 
dst_port; + + if (find_dev(net, dst_port)) + return -EBUSY; + + err = register_netdevice(dev); if (err) - return ERR_PTR(err); + return err; - mutex_lock(&stt_mutex); - rcu_read_lock(); - stt_sock = stt_find_sock(net, port); - rcu_read_unlock(); - if (stt_sock) - stt_sock = ERR_PTR(-EBUSY); - else - stt_sock = stt_socket_create(net, port, rcv, data); + list_add(&stt->next, &sn->stt_list); + return 0; +} + +static int stt_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + __be16 dst_port = htons(STT_DST_PORT); + + if (data[IFLA_STT_PORT]) + dst_port = nla_get_be16(data[IFLA_STT_PORT]); + + return stt_configure(net, dev, dst_port); +} + +static void stt_dellink(struct net_device *dev, struct list_head *head) +{ + struct stt_dev *stt = netdev_priv(dev); - mutex_unlock(&stt_mutex); + list_del(&stt->next); + unregister_netdevice_queue(dev, head); +} + +static size_t stt_get_size(const struct net_device *dev) +{ + return nla_total_size(sizeof(__be32)); /* IFLA_STT_PORT */ +} + +static int stt_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct stt_dev *stt = netdev_priv(dev); + + if (nla_put_be16(skb, IFLA_STT_PORT, stt->dst_port)) + goto nla_put_failure; - if (IS_ERR(stt_sock)) - stt_cleanup(); + return 0; - return stt_sock; +nla_put_failure: + return -EMSGSIZE; } -EXPORT_SYMBOL_GPL(rpl_stt_sock_add); -void rpl_stt_sock_release(struct stt_sock *stt_sock) +static struct rtnl_link_ops stt_link_ops __read_mostly = { + .kind = "stt", + .maxtype = IFLA_STT_MAX, + .policy = stt_policy, + .priv_size = sizeof(struct stt_dev), + .setup = stt_setup, + .validate = stt_validate, + .newlink = stt_newlink, + .dellink = stt_dellink, + .get_size = stt_get_size, + .fill_info = stt_fill_info, +}; + +struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name, + u8 name_assign_type, u16 dst_port) { - mutex_lock(&stt_mutex); - if (stt_sock) { - __stt_sock_release(stt_sock); - stt_cleanup(); + struct nlattr 
*tb[IFLA_MAX + 1]; + struct net_device *dev; + int err; + + memset(tb, 0, sizeof(tb)); + dev = rtnl_create_link(net, (char *) name, name_assign_type, + &stt_link_ops, tb); + if (IS_ERR(dev)) + return dev; + + err = stt_configure(net, dev, htons(dst_port)); + if (err) { + free_netdev(dev); + return ERR_PTR(err); } - mutex_unlock(&stt_mutex); + return dev; } -EXPORT_SYMBOL_GPL(rpl_stt_sock_release); +EXPORT_SYMBOL_GPL(ovs_stt_dev_create_fb); static int stt_init_net(struct net *net) { struct stt_net *sn = net_generic(net, stt_net_id); - INIT_LIST_HEAD(&sn->sock_list); + INIT_LIST_HEAD(&sn->stt_list); return 0; } +static void stt_exit_net(struct net *net) +{ + struct stt_net *sn = net_generic(net, stt_net_id); + struct stt_dev *stt, *next; + struct net_device *dev, *aux; + LIST_HEAD(list); + + rtnl_lock(); + + /* gather any stt devices that were moved into this ns */ + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &stt_link_ops) + unregister_netdevice_queue(dev, &list); + + list_for_each_entry_safe(stt, next, &sn->stt_list, next) { + /* If stt->dev is in the same netns, it was already added + * to the stt by the previous loop. 
+ */ + if (!net_eq(dev_net(stt->dev), net)) + unregister_netdevice_queue(stt->dev, &list); + } + + /* unregister the devices gathered above */ + unregister_netdevice_many(&list); + rtnl_unlock(); +} + static struct pernet_operations stt_net_ops = { .init = stt_init_net, + .exit = stt_exit_net, .id = &stt_net_id, .size = sizeof(struct stt_net), }; -int ovs_stt_init_module(void) +int stt_init_module(void) { - return register_pernet_subsys(&stt_net_ops); + int rc; + + rc = register_pernet_subsys(&stt_net_ops); + if (rc) + goto out1; + + rc = rtnl_link_register(&stt_link_ops); + if (rc) + goto out2; + + pr_info("STT tunneling driver\n"); + return 0; +out2: + unregister_pernet_subsys(&stt_net_ops); +out1: + return rc; } -EXPORT_SYMBOL_GPL(ovs_stt_init_module); -void ovs_stt_cleanup_module(void) +void stt_cleanup_module(void) { + rtnl_link_unregister(&stt_link_ops); unregister_pernet_subsys(&stt_net_ops); } -EXPORT_SYMBOL_GPL(ovs_stt_cleanup_module); #endif diff --git a/datapath/linux/compat/udp_tunnel.c b/datapath/linux/compat/udp_tunnel.c index 19a1ea562..f72e64563 100644 --- a/datapath/linux/compat/udp_tunnel.c +++ b/datapath/linux/compat/udp_tunnel.c @@ -1,6 +1,6 @@ #include <linux/version.h> -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) +#ifndef HAVE_METADATA_DST #include <linux/module.h> #include <linux/errno.h> @@ -12,6 +12,9 @@ #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/net_namespace.h> +#include <net/ip6_checksum.h> +#include <net/ip6_tunnel.h> + int rpl_udp_sock_create(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) @@ -168,4 +171,97 @@ void rpl_udp_tunnel_sock_release(struct socket *sock) } EXPORT_SYMBOL_GPL(rpl_udp_tunnel_sock_release); -#endif /* Linux version < 4.0 */ +#if IS_ENABLED(CONFIG_IPV6) + +#define udp_v6_check rpl_udp_v6_check +static __sum16 udp_v6_check(int len, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __wsum base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base); 
+} + +#define udp6_set_csum rpl_udp6_set_csum +static void udp6_set_csum(bool nocheck, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v6_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} + +#define ip6_flow_hdr rpl_ip6_flow_hdr +static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, + __be32 flowlabel) +{ + *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel; +} + +int rpl_udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, struct in6_addr *saddr, + struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be16 src_port, + __be16 dst_port, bool nocheck) +{ + struct udphdr *uh; + struct ipv6hdr *ip6h; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + + uh->len = htons(skb->len); + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED + | IPSKB_REROUTED); + skb_dst_set(skb, dst); + + udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, prio, 
htonl(0)); + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + + ip6tunnel_xmit(sk, skb, dev); + return 0; +} +#endif +#endif diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c index fd454ae29..4076a2fd3 100644 --- a/datapath/linux/compat/vxlan.c +++ b/datapath/linux/compat/vxlan.c @@ -1,25 +1,13 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * VXLAN: Virtual eXtensible Local Area Network * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. + * Copyright (c) 2012-2013 Vyatta Inc. * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - * - * This code is derived from kernel vxlan module. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
*/ -#include <linux/version.h> - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> @@ -39,10 +27,10 @@ #include <linux/if_vlan.h> #include <linux/hash.h> #include <linux/ethtool.h> +#include <linux/netdev_features.h> #include <net/arp.h> #include <net/ndisc.h> #include <net/ip.h> -#include <net/gre.h> #include <net/ip_tunnels.h> #include <net/icmp.h> #include <net/udp.h> @@ -54,27 +42,878 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/vxlan.h> +#include <net/protocol.h> +#include <net/udp_tunnel.h> +#include <net/ip6_route.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/ip6_tunnel.h> +#include <net/ip6_checksum.h> +#endif +#include <net/dst_metadata.h> -#include "compat.h" -#include "datapath.h" +#ifndef HAVE_METADATA_DST #include "gso.h" -#include "vlan.h" +#include "vport-netdev.h" + +#define VXLAN_VERSION "0.1" + +#define PORT_HASH_BITS 8 +#define PORT_HASH_SIZE (1<<PORT_HASH_BITS) +#define FDB_AGE_DEFAULT 300 /* 5 min */ +#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ + +#ifndef NTF_SELF +#define NTF_SELF 0x02 +#endif + +/* UDP port for VXLAN traffic. + * The IANA assigned port is 4789, but the Linux default is 8472 + * for compatibility with early adopters. 
+ */ +static unsigned short vxlan_port __read_mostly = 8472; +module_param_named(udp_port, vxlan_port, ushort, 0444); +MODULE_PARM_DESC(udp_port, "Destination UDP port"); + +static int vxlan_net_id; +static struct rtnl_link_ops vxlan_link_ops; -#ifndef USE_UPSTREAM_VXLAN +static const u8 all_zeros_mac[ETH_ALEN]; -/* VXLAN protocol header */ -struct vxlanhdr { - __be32 vx_flags; - __be32 vx_vni; +static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, + bool no_share, u32 flags); + +/* per-network namespace private data for this module */ +struct vxlan_net { + struct list_head vxlan_list; + struct hlist_head sock_list[PORT_HASH_SIZE]; + spinlock_t sock_lock; +}; + +/* Forwarding table entry */ +struct vxlan_fdb { + struct hlist_node hlist; /* linked list of entries */ + struct rcu_head rcu; + unsigned long updated; /* jiffies */ + unsigned long used; + struct list_head remotes; + u8 eth_addr[ETH_ALEN]; + u16 state; /* see ndm_state */ + u8 flags; /* see ndm_flags */ }; +/* salt for hash table */ +static u32 vxlan_salt __read_mostly; +static struct workqueue_struct *vxlan_wq; + +static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) +{ + return vs->flags & VXLAN_F_COLLECT_METADATA || + ip_tunnel_collect_metadata(); +} + +#if IS_ENABLED(CONFIG_IPV6) +static inline +bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) +{ + if (a->sa.sa_family != b->sa.sa_family) + return false; + if (a->sa.sa_family == AF_INET6) + return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); + else + return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; +} + +static inline bool vxlan_addr_any(const union vxlan_addr *ipa) +{ + if (ipa->sa.sa_family == AF_INET6) + return ipv6_addr_any(&ipa->sin6.sin6_addr); + else + return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); +} + +static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) +{ + if (ipa->sa.sa_family == AF_INET6) + return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr); 
+ else + return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); +} + +#else /* !CONFIG_IPV6 */ + +static inline +bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) +{ + return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; +} + +static inline bool vxlan_addr_any(const union vxlan_addr *ipa) +{ + return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); +} + +static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) +{ + return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); +} + +#endif + +/* Virtual Network hash table head */ +static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) +{ + return &vs->vni_list[hash_32(id, VNI_HASH_BITS)]; +} + +/* Socket hash table head */ +static inline struct hlist_head *vs_head(struct net *net, __be16 port) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + + return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; +} + +/* First remote destination for a forwarding entry. + * Guaranteed to be non-NULL because remotes are never deleted. + */ +static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) +{ + return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); +} + +static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) +{ + return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); +} + +/* Find VXLAN socket based on network namespace, address family and UDP port + * and enabled unshareable flags. 
+ */ +static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, + __be16 port, u32 flags) +{ + struct vxlan_sock *vs; + + flags &= VXLAN_F_RCV_FLAGS; + + hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { + if (inet_sport(vs->sock->sk) == port && + vxlan_get_sk_family(vs) == family && + vs->flags == flags) + return vs; + } + return NULL; +} + +static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id) +{ + struct vxlan_dev *vxlan; + + hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { + if (vxlan->default_dst.remote_vni == id) + return vxlan; + } + + return NULL; +} + +/* Look up VNI in a per net namespace table */ +static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, + sa_family_t family, __be16 port, + u32 flags) +{ + struct vxlan_sock *vs; + + vs = vxlan_find_sock(net, family, port, flags); + if (!vs) + return NULL; + + return vxlan_vs_find_vni(vs, id); +} + +/* Fill in neighbour message in skbuff. */ +static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, + const struct vxlan_fdb *fdb, + u32 portid, u32 seq, int type, unsigned int flags, + const struct vxlan_rdst *rdst) +{ + return -EINVAL; +} + +static inline size_t vxlan_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ + + nla_total_size(sizeof(__be16)) /* NDA_PORT */ + + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ + + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */ + + nla_total_size(sizeof(struct nda_cacheinfo)); +} + +static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, + struct vxlan_rdst *rd, int type) +{ + struct net *net = dev_net(vxlan->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 
0, rd); + if (err < 0) { + /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +} + +/* Hash Ethernet address */ +static u32 eth_hash(const unsigned char *addr) +{ + u64 value = get_unaligned((u64 *)addr); + + /* only want 6 bytes */ +#ifdef __BIG_ENDIAN + value >>= 16; +#else + value <<= 16; +#endif + return hash_64(value, FDB_HASH_BITS); +} + +/* Hash chain to use given mac address */ +static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, + const u8 *mac) +{ + return &vxlan->fdb_head[eth_hash(mac)]; +} + +/* Look up Ethernet address in forwarding table */ +static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, + const u8 *mac) +{ + struct hlist_head *head = vxlan_fdb_head(vxlan, mac); + struct vxlan_fdb *f; + + hlist_for_each_entry_rcu(f, head, hlist) { + if (ether_addr_equal(mac, f->eth_addr)) + return f; + } + + return NULL; +} + +static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, + const u8 *mac) +{ + struct vxlan_fdb *f; + + f = __vxlan_find_mac(vxlan, mac); + if (f) + f->used = jiffies; + + return f; +} + +/* caller should hold vxlan->hash_lock */ +static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, + union vxlan_addr *ip, __be16 port, + __u32 vni, __u32 ifindex) +{ + struct vxlan_rdst *rd; + + list_for_each_entry(rd, &f->remotes, list) { + if (vxlan_addr_equal(&rd->remote_ip, ip) && + rd->remote_port == port && + rd->remote_vni == vni && + rd->remote_ifindex == ifindex) + return rd; + } + + return NULL; +} + +/* Replace destination of unicast mac */ +static int vxlan_fdb_replace(struct vxlan_fdb *f, + union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) +{ + struct vxlan_rdst *rd; + + rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); + if (rd) + return 0; + + rd = 
list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); + if (!rd) + return 0; + rd->remote_ip = *ip; + rd->remote_port = port; + rd->remote_vni = vni; + rd->remote_ifindex = ifindex; + return 1; +} + +/* Add/update destinations for multicast */ +static int vxlan_fdb_append(struct vxlan_fdb *f, + union vxlan_addr *ip, __be16 port, __u32 vni, + __u32 ifindex, struct vxlan_rdst **rdp) +{ + struct vxlan_rdst *rd; + + rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); + if (rd) + return 0; + + rd = kmalloc(sizeof(*rd), GFP_ATOMIC); + if (rd == NULL) + return -ENOBUFS; + rd->remote_ip = *ip; + rd->remote_port = port; + rd->remote_vni = vni; + rd->remote_ifindex = ifindex; + + list_add_tail_rcu(&rd->list, &f->remotes); + + *rdp = rd; + return 1; +} + +#ifdef HAVE_UDP_OFFLOAD +#ifdef HAVE_NETIF_F_GSO_TUNNEL_REMCSUM +static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, + unsigned int off, + struct vxlanhdr *vh, size_t hdrlen, + u32 data, struct gro_remcsum *grc, + bool nopartial) +{ + size_t start, offset; + + if (skb->remcsum_offload) + return vh; + + if (!NAPI_GRO_CB(skb)->csum_valid) + return NULL; + + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; + offset = start + ((data & VXLAN_RCO_UDP) ? 
+ offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check)); + + vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, + start, offset, grc, nopartial); + + skb->remcsum_offload = 1; + + return vh; +} +#else +static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, + unsigned int off, + struct vxlanhdr *vh, size_t hdrlen, + u32 data, struct gro_remcsum *grc, + bool nopartial) +{ + return NULL; +} +#endif + +#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF +static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +#else +static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) +#endif +{ +#ifdef HAVE_UDP_OFFLOAD_ARG_UOFF + struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, + udp_offloads); +#else + struct vxlan_sock *vs = NULL; +#endif + struct sk_buff *p, **pp = NULL; + struct vxlanhdr *vh, *vh2; + unsigned int hlen, off_vx; + int flush = 1; + u32 flags; + struct gro_remcsum grc; + + skb_gro_remcsum_init(&grc); + + off_vx = skb_gro_offset(skb); + hlen = off_vx + sizeof(*vh); + vh = skb_gro_header_fast(skb, off_vx); + if (skb_gro_header_hard(skb, hlen)) { + vh = skb_gro_header_slow(skb, hlen, off_vx); + if (unlikely(!vh)) + goto out; + } + + skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); + + flags = ntohl(vh->vx_flags); + + if ((flags & VXLAN_HF_RCO) && vs && (vs->flags & VXLAN_F_REMCSUM_RX)) { + + vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), + ntohl(vh->vx_vni), &grc, + !!(vs->flags & + VXLAN_F_REMCSUM_NOPARTIAL)); + + if (!vh) + goto out; + } + + skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ + + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + vh2 = (struct vxlanhdr *)(p->data + off_vx); + if (vh->vx_flags != vh2->vx_flags || + vh->vx_vni != vh2->vx_vni) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + pp = eth_gro_receive(head, skb); + +out: + 
skb_gro_remcsum_cleanup(skb, &grc); + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF +static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) +#else +static int vxlan_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) +#endif +{ + udp_tunnel_gro_complete(skb, nhoff); + + return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); +} + +/* Notify netdevs that UDP port started listening */ +static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) +{ + struct net_device *dev; + struct sock *sk = vs->sock->sk; + struct net *net = sock_net(sk); + sa_family_t sa_family = vxlan_get_sk_family(vs); + __be16 port = inet_sk(sk)->inet_sport; + int err; + + if (sa_family == AF_INET) { + err = udp_add_offload(&vs->udp_offloads); + if (err) + pr_warn("vxlan: udp_add_offload failed with status %d\n", err); + } + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->netdev_ops->ndo_add_vxlan_port) + dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, + port); + } + rcu_read_unlock(); +} + +/* Notify netdevs that UDP port is no more listening */ +static void vxlan_notify_del_rx_port(struct vxlan_sock *vs) +{ + struct net_device *dev; + struct sock *sk = vs->sock->sk; + struct net *net = sock_net(sk); + sa_family_t sa_family = vxlan_get_sk_family(vs); + __be16 port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->netdev_ops->ndo_del_vxlan_port) + dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family, + port); + } + rcu_read_unlock(); + + if (sa_family == AF_INET) + udp_del_offload(&vs->udp_offloads); +} +#endif + +/* Add new entry to forwarding table -- assumes lock held */ +static int vxlan_fdb_create(struct vxlan_dev *vxlan, + const u8 *mac, union vxlan_addr *ip, + __u16 state, __u16 flags, + __be16 port, __u32 vni, __u32 ifindex, + __u8 ndm_flags) +{ + struct vxlan_rdst *rd = NULL; + struct vxlan_fdb *f; + int notify = 0; + + f = __vxlan_find_mac(vxlan, 
mac); + if (f) { + if (flags & NLM_F_EXCL) { + netdev_dbg(vxlan->dev, + "lost race to create %pM\n", mac); + return -EEXIST; + } + if (f->state != state) { + f->state = state; + f->updated = jiffies; + notify = 1; + } + if (f->flags != ndm_flags) { + f->flags = ndm_flags; + f->updated = jiffies; + notify = 1; + } + if ((flags & NLM_F_REPLACE)) { + /* Only change unicasts */ + if (!(is_multicast_ether_addr(f->eth_addr) || + is_zero_ether_addr(f->eth_addr))) { + notify |= vxlan_fdb_replace(f, ip, port, vni, + ifindex); + } else + return -EOPNOTSUPP; + } + if ((flags & NLM_F_APPEND) && + (is_multicast_ether_addr(f->eth_addr) || + is_zero_ether_addr(f->eth_addr))) { + int rc = vxlan_fdb_append(f, ip, port, vni, ifindex, + &rd); + + if (rc < 0) + return rc; + notify |= rc; + } + } else { + if (!(flags & NLM_F_CREATE)) + return -ENOENT; + + if (vxlan->cfg.addrmax && + vxlan->addrcnt >= vxlan->cfg.addrmax) + return -ENOSPC; + + /* Disallow replace to add a multicast entry */ + if ((flags & NLM_F_REPLACE) && + (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) + return -EOPNOTSUPP; + + netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); + f = kmalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -ENOMEM; + + notify = 1; + f->state = state; + f->flags = ndm_flags; + f->updated = f->used = jiffies; + INIT_LIST_HEAD(&f->remotes); + memcpy(f->eth_addr, mac, ETH_ALEN); + + vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); + + ++vxlan->addrcnt; + hlist_add_head_rcu(&f->hlist, + vxlan_fdb_head(vxlan, mac)); + } + + if (notify) { + if (rd == NULL) + rd = first_remote_rtnl(f); + vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH); + } + + return 0; +} + +static void vxlan_fdb_free(struct rcu_head *head) +{ + struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); + struct vxlan_rdst *rd, *nd; + + list_for_each_entry_safe(rd, nd, &f->remotes, list) + kfree(rd); + kfree(f); +} + +static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) +{ + 
netdev_dbg(vxlan->dev, + "delete %pM\n", f->eth_addr); + + --vxlan->addrcnt; + vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH); + + hlist_del_rcu(&f->hlist); + call_rcu(&f->rcu, vxlan_fdb_free); +} + +/* Watch incoming packets to learn mapping between Ethernet address + * and Tunnel endpoint. + * Return true if packet is bogus and should be dropped. + */ +static bool vxlan_snoop(struct net_device *dev, + union vxlan_addr *src_ip, const u8 *src_mac) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_fdb *f; + + f = vxlan_find_mac(vxlan, src_mac); + if (likely(f)) { + struct vxlan_rdst *rdst = first_remote_rcu(f); + + if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip))) + return false; + + /* Don't migrate static entries, drop packets */ + if (f->state & NUD_NOARP) + return true; + + if (net_ratelimit()) + netdev_info(dev, + "%pM migrated from %pIS to %pIS\n", + src_mac, &rdst->remote_ip.sa, &src_ip->sa); + + rdst->remote_ip = *src_ip; + f->updated = jiffies; + vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH); + } else { + /* learned new entry */ + spin_lock(&vxlan->hash_lock); + + /* close off race between vxlan_flush and incoming packets */ + if (netif_running(dev)) + vxlan_fdb_create(vxlan, src_mac, src_ip, + NUD_REACHABLE, + NLM_F_EXCL|NLM_F_CREATE, + vxlan->cfg.dst_port, + vxlan->default_dst.remote_vni, + 0, NTF_SELF); + spin_unlock(&vxlan->hash_lock); + } + + return false; +} + +/* See if multicast group is already in use by other ID */ +static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) +{ + struct vxlan_dev *vxlan; + + /* The vxlan_sock is only used by dev, leaving group has + * no effect on other vxlan devices. 
+ */ + if (atomic_read(&dev->vn_sock->refcnt) == 1) + return false; + + list_for_each_entry(vxlan, &vn->vxlan_list, next) { + if (!netif_running(vxlan->dev) || vxlan == dev) + continue; + + if (vxlan->vn_sock != dev->vn_sock) + continue; + + if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip, + &dev->default_dst.remote_ip)) + continue; + + if (vxlan->default_dst.remote_ifindex != + dev->default_dst.remote_ifindex) + continue; + + return true; + } + + return false; +} + +static void vxlan_sock_release(struct vxlan_sock *vs) +{ + struct sock *sk = vs->sock->sk; + struct net *net = sock_net(sk); + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + + if (!atomic_dec_and_test(&vs->refcnt)) + return; + + spin_lock(&vn->sock_lock); + hlist_del_rcu(&vs->hlist); +#ifdef HAVE_UDP_OFFLOAD + vxlan_notify_del_rx_port(vs); +#endif + spin_unlock(&vn->sock_lock); + + queue_work(vxlan_wq, &vs->del_work); +} + +/* Update multicast group membership when first VNI on + * multicast address is brought up + */ +static int vxlan_igmp_join(struct vxlan_dev *vxlan) +{ + return -EINVAL; +} + +/* Inverse of vxlan_igmp_join when last VNI is brought down */ +static int vxlan_igmp_leave(struct vxlan_dev *vxlan) +{ + return -EINVAL; +} + +#ifdef HAVE_VXLAN_HF_RCO +static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, + size_t hdrlen, u32 data, bool nopartial) +{ + size_t start, offset, plen; + + if (skb->remcsum_offload) + return vh; + + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; + offset = start + ((data & VXLAN_RCO_UDP) ? 
+ offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check)); + + plen = hdrlen + offset + sizeof(u16); + + if (!pskb_may_pull(skb, plen)) + return NULL; + + vh = (struct vxlanhdr *)(udp_hdr(skb) + 1); + + skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset, + nopartial); + + return vh; +} +#endif + +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, + struct vxlan_metadata *md, u32 vni, + struct metadata_dst *tun_dst) +{ + struct iphdr *oip = NULL; + struct ipv6hdr *oip6 = NULL; + struct vxlan_dev *vxlan; +#ifdef HAVE_DEV_TSTATS + struct pcpu_sw_netstats *stats; +#endif + union vxlan_addr saddr; + int err = 0; + union vxlan_addr *remote_ip; + + /* For flow based devices, map all packets to VNI 0 */ + if (vs->flags & VXLAN_F_COLLECT_METADATA) + vni = 0; + + /* Is this VNI defined? */ + vxlan = vxlan_vs_find_vni(vs, vni); + if (!vxlan) + goto drop; + + remote_ip = &vxlan->default_dst.remote_ip; + skb_reset_mac_header(skb); + skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); + skb->protocol = eth_type_trans(skb, vxlan->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + /* Ignore packet loops (and multicast echo) */ + if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) + goto drop; + + /* Re-examine inner Ethernet packet */ + if (remote_ip->sa.sa_family == AF_INET) { + oip = ip_hdr(skb); + saddr.sin.sin_addr.s_addr = oip->saddr; + saddr.sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + oip6 = ipv6_hdr(skb); + saddr.sin6.sin6_addr = oip6->saddr; + saddr.sa.sa_family = AF_INET6; +#endif + } + + if (tun_dst) { + ovs_skb_dst_set(skb, (struct dst_entry *)tun_dst); + tun_dst = NULL; + } else { + goto drop; + } + + if ((vxlan->flags & VXLAN_F_LEARN) && + vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) + goto drop; + + skb_reset_network_header(skb); + /* In flow-based mode, GBP is carried in dst_metadata */ + if (!(vs->flags & VXLAN_F_COLLECT_METADATA)) + skb->mark = md->gbp; + + if 
(oip6) + err = IP6_ECN_decapsulate(oip6, skb); + if (oip) + err = IP_ECN_decapsulate(oip, skb); + + if (unlikely(err)) { + if (err > 1) { + ++vxlan->dev->stats.rx_frame_errors; + ++vxlan->dev->stats.rx_errors; + goto drop; + } + } + +#ifdef HAVE_DEV_TSTATS + stats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)vxlan->dev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); +#endif + netdev_port_receive(skb, skb_tunnel_info(skb)); + return; +drop: + + /* Consume bad packet */ + kfree_skb(skb); +} + /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct vxlan_sock *vs; struct vxlanhdr *vxh; u32 flags, vni; - struct vxlan_metadata md = {0}; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; + union { + struct metadata_dst dst; + char buf[sizeof(struct metadata_dst) + sizeof(*md)]; + } buf; /* Need Vxlan and inner Ethernet header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) @@ -93,73 +932,83 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) goto drop; + vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; +#ifdef HAVE_VXLAN_HF_RCO + if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { + vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni, + !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); + if (!vxh) + goto drop; + + flags &= ~VXLAN_HF_RCO; + vni &= VXLAN_VNI_MASK; + } +#endif + + if (vxlan_collect_metadata(vs)) { + ovs_udp_tun_rx_dst(&buf.dst.u.tun_info, skb, AF_INET, TUNNEL_KEY, + cpu_to_be64(vni >> 8), sizeof(*md)); + + md = ip_tunnel_info_opts(&buf.dst.u.tun_info); + } else { + memset(md, 0, sizeof(*md)); + } + /* For backwards compatibility, only allow reserved fields to be - * used by VXLAN extensions if explicitly 
requested. - */ + * used by VXLAN extensions if explicitly requested. + */ if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) { struct vxlanhdr_gbp *gbp; gbp = (struct vxlanhdr_gbp *)vxh; - md.gbp = ntohs(gbp->policy_id); + md->gbp = ntohs(gbp->policy_id); + + buf.dst.u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; if (gbp->dont_learn) - md.gbp |= VXLAN_GBP_DONT_LEARN; + md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) - md.gbp |= VXLAN_GBP_POLICY_APPLIED; + md->gbp |= VXLAN_GBP_POLICY_APPLIED; flags &= ~VXLAN_GBP_USED_BITS; } - if (flags || (vni & 0xff)) { + if (flags || vni & ~VXLAN_VNI_MASK) { /* If there are any unprocessed flags remaining treat - * this as a malformed packet. This behavior diverges from - * VXLAN RFC (RFC7348) which stipulates that bits in reserved - * in reserved fields are to be ignored. The approach here - * maintains compatbility with previous stack code, and also - * is more robust and provides a little more security in - * adding extensions to VXLAN. - */ + * this as a malformed packet. This behavior diverges from + * VXLAN RFC (RFC7348) which stipulates that bits in reserved + * in reserved fields are to be ignored. The approach here + * maintains compatibility with previous stack code, and also + * is more robust and provides a little more security in + * adding extensions to VXLAN. 
+ */ goto bad_flags; } - md.vni = vxh->vx_vni; - vs->rcv(vs, skb, &md); + vxlan_rcv(vs, skb, md, vni >> 8, &buf.dst); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; + bad_flags: - pr_debug("invalid vxlan flags=%#x vni=%#x\n", - ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); + netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", + ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); error: /* Return non vxlan pkt */ return 1; } -static void vxlan_sock_put(struct sk_buff *skb) -{ - sock_put(skb->sk); -} - -/* On transmit, associate with the tunnel socket */ -static void vxlan_set_owner(struct sock *sk, struct sk_buff *skb) -{ - skb_orphan(skb); - sock_hold(sk); - skb->sk = sk; - skb->destructor = vxlan_sock_put; -} - static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, struct vxlan_metadata *md) { @@ -180,15 +1029,130 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); } -int rpl_vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, - struct vxlan_metadata *md, bool xnet, u32 vxflags) +#if IS_ENABLED(CONFIG_IPV6) +static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, struct in6_addr *saddr, + struct in6_addr *daddr, __u8 prio, __u8 ttl, + __be16 src_port, __be16 dst_port, __be32 vni, + struct vxlan_metadata *md, bool xnet, u32 vxflags) +{ + struct vxlanhdr *vxh; + int min_headroom; + int err; + bool udp_sum = !(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX); + int type = 0; + + if ((vxflags & VXLAN_F_REMCSUM_TX) && + skb->ip_summed == CHECKSUM_PARTIAL) { + int csum_start = skb_checksum_start_offset(skb); + + if (csum_start <= VXLAN_MAX_REMCSUM_START && + !(csum_start & VXLAN_RCO_SHIFT_MASK) && + (skb->csum_offset == offsetof(struct udphdr, check) || + skb->csum_offset == offsetof(struct tcphdr, check))) { + 
udp_sum = false; + type |= SKB_GSO_TUNNEL_REMCSUM; + /* Add support for remote csum. */ + if (!SKB_GSO_TUNNEL_REMCSUM) { + kfree_skb(skb); + err = -EOPNOTSUPP; + goto err; + } + } + } + + skb_scrub_packet(skb, xnet); + + min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + + VXLAN_HLEN + sizeof(struct ipv6hdr) + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + + /* Need space for new headers (invalidates iph ptr) */ + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) { + kfree_skb(skb); + goto err; + } + + skb = vlan_hwaccel_push_inside(skb); + if (WARN_ON(!skb)) { + err = -ENOMEM; + goto err; + } + + skb = udp_tunnel_handle_offloads(skb, udp_sum, type, true); + if (IS_ERR(skb)) { + err = -EINVAL; + goto err; + } + + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); + vxh->vx_flags = htonl(VXLAN_HF_VNI); + vxh->vx_vni = vni; + + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u16 hdrlen = sizeof(struct vxlanhdr); + u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> + VXLAN_RCO_SHIFT; + + if (skb->csum_offset == offsetof(struct udphdr, check)) + data |= VXLAN_RCO_UDP; + + vxh->vx_vni |= htonl(data); + vxh->vx_flags |= htonl(VXLAN_HF_RCO); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) + skb->encapsulation = 0; +#endif + } + } + + if (vxflags & VXLAN_F_GBP) + vxlan_build_gbp_hdr(vxh, vxflags, md); + + ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + + udp_tunnel6_xmit_skb(dst, sk, skb, dev, saddr, daddr, prio, + ttl, src_port, dst_port, + !!(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX)); + return 0; +err: + dst_release(dst); + return err; +} +#endif + +static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, __be32 vni, + struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; int err; bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM); 
+ int type = 0; + + if ((vxflags & VXLAN_F_REMCSUM_TX) && + skb->ip_summed == CHECKSUM_PARTIAL) { + int csum_start = skb_checksum_start_offset(skb); + + if (csum_start <= VXLAN_MAX_REMCSUM_START && + !(csum_start & VXLAN_RCO_SHIFT_MASK) && + (skb->csum_offset == offsetof(struct udphdr, check) || + skb->csum_offset == offsetof(struct tcphdr, check))) { + udp_sum = false; + type |= SKB_GSO_TUNNEL_REMCSUM; + + if (!SKB_GSO_TUNNEL_REMCSUM) { + kfree_skb(skb); + return -EOPNOTSUPP; + } + } + } min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len + VXLAN_HLEN + sizeof(struct iphdr) @@ -205,28 +1169,601 @@ int rpl_vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, if (WARN_ON(!skb)) return -ENOMEM; - skb = udp_tunnel_handle_offloads(skb, udp_sum, true); + skb = udp_tunnel_handle_offloads(skb, udp_sum, type, true); if (IS_ERR(skb)) return PTR_ERR(skb); vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_HF_VNI); - vxh->vx_vni = md->vni; + vxh->vx_vni = vni; + + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u16 hdrlen = sizeof(struct vxlanhdr); + u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> + VXLAN_RCO_SHIFT; + + if (skb->csum_offset == offsetof(struct udphdr, check)) + data |= VXLAN_RCO_UDP; + + vxh->vx_vni |= htonl(data); + vxh->vx_flags |= htonl(VXLAN_HF_RCO); + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) + skb->encapsulation = 0; +#endif + } + } if (vxflags & VXLAN_F_GBP) vxlan_build_gbp_hdr(vxh, vxflags, md); - vxlan_set_owner(sk, skb); - ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos, ttl, df, src_port, dst_port, xnet, - !udp_sum); + !(vxflags & VXLAN_F_UDP_CSUM)); +} + +static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, + struct vxlan_rdst *rdst, bool did_rsc) +{ + struct ip_tunnel_info *info; + struct vxlan_dev *vxlan = netdev_priv(dev); + struct 
sock *sk = vxlan->vn_sock->sock->sk; + unsigned short family = vxlan_get_sk_family(vxlan->vn_sock); + struct rtable *rt = NULL; + const struct iphdr *old_iph; + struct flowi4 fl4; + union vxlan_addr *dst; + union vxlan_addr remote_ip; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; + __be16 src_port = 0, dst_port; + u32 vni; + __be16 df = 0; + __u8 tos, ttl; + int err; + u32 flags = vxlan->flags; + + info = skb_tunnel_info(skb); + + if (rdst) { + dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; + vni = rdst->remote_vni; + dst = &rdst->remote_ip; + } else { + if (!info) { + WARN_ONCE(1, "%s: Missing encapsulation instructions\n", + dev->name); + goto drop; + } + if (family != ip_tunnel_info_af(info)) + goto drop; + + dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; + vni = be64_to_cpu(info->key.tun_id); + remote_ip.sa.sa_family = family; + if (family == AF_INET) + remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; + else + remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; + dst = &remote_ip; + } + + if (vxlan_addr_any(dst)) { + if (did_rsc) { + /* short-circuited back to local bridge */ + WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n", + dev->name); + } + goto drop; + } + + old_iph = ip_hdr(skb); + + ttl = vxlan->cfg.ttl; + if (!ttl && vxlan_addr_multicast(dst)) + ttl = 1; + + tos = vxlan->cfg.tos; + if (tos == 1) + tos = ip_tunnel_get_dsfield(old_iph, skb); + + src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, + vxlan->cfg.port_max, true); + + if (info) { + if (info->key.tun_flags & TUNNEL_CSUM) + flags |= VXLAN_F_UDP_CSUM; + else + flags &= ~VXLAN_F_UDP_CSUM; + + ttl = info->key.ttl; + tos = info->key.tos; + + if (info->options_len) + md = ip_tunnel_info_opts(info); + } else { + md->gbp = skb->mark; + } + + if (dst->sa.sa_family == AF_INET) { + if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)) + df = htons(IP_DF); + + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_oif = rdst ? 
rdst->remote_ifindex : 0; + fl4.flowi4_tos = RT_TOS(tos); + fl4.flowi4_mark = skb->mark; + fl4.flowi4_proto = IPPROTO_UDP; + fl4.daddr = dst->sin.sin_addr.s_addr; + fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr; + + rt = ip_route_output_key(vxlan->net, &fl4); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", + &dst->sin.sin_addr.s_addr); + dev->stats.tx_carrier_errors++; + goto tx_error; + } + + if (rt_dst(rt).dev == dev) { + netdev_dbg(dev, "circular route to %pI4\n", + &dst->sin.sin_addr.s_addr); + dev->stats.collisions++; + goto rt_tx_error; + } + + /* Bypass encapsulation if the destination is local */ + if (rt->rt_flags & RTCF_LOCAL && + !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + struct vxlan_dev *dst_vxlan; + + ip_rt_put(rt); + dst_vxlan = vxlan_find_vni(vxlan->net, vni, + dst->sa.sa_family, dst_port, + vxlan->flags); + if (!dst_vxlan) + goto tx_error; + WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n", + dev->name); + goto tx_error; + } + + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + ttl = ttl ? : ip4_dst_hoplimit(&rt_dst(rt)); + err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, + dst->sin.sin_addr.s_addr, tos, ttl, df, + src_port, dst_port, htonl(vni << 8), md, + !net_eq(vxlan->net, dev_net(vxlan->dev)), + flags); + if (err < 0) { + /* skb is already freed. */ + skb = NULL; + goto rt_tx_error; + } + + iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct dst_entry *ndst; + struct flowi6 fl6; + u32 rt6i_flags; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = rdst ? 
rdst->remote_ifindex : 0; + fl6.daddr = dst->sin6.sin6_addr; + fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr; + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = IPPROTO_UDP; + +#ifdef HAVE_IPV6_DST_LOOKUP_NET + if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) { +#else +#ifdef HAVE_IPV6_STUB + if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) { +#else + ndst = ip6_route_output(vxlan->net, sk, &fl6); + if (ndst->error) { +#endif +#endif + netdev_dbg(dev, "no route to %pI6\n", + &dst->sin6.sin6_addr); + dev->stats.tx_carrier_errors++; + goto tx_error; + } + + if (ndst->dev == dev) { + netdev_dbg(dev, "circular route to %pI6\n", + &dst->sin6.sin6_addr); + dst_release(ndst); + dev->stats.collisions++; + goto tx_error; + } + + /* Bypass encapsulation if the destination is local */ + rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; + if (rt6i_flags & RTF_LOCAL && + !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + struct vxlan_dev *dst_vxlan; + + dst_release(ndst); + dst_vxlan = vxlan_find_vni(vxlan->net, vni, + dst->sa.sa_family, dst_port, + vxlan->flags); + if (!dst_vxlan) + goto tx_error; + WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n", + dev->name); + goto tx_error; + } + + ttl = ttl ? : ip6_dst_hoplimit(ndst); + err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, + 0, ttl, src_port, dst_port, htonl(vni << 8), md, + !net_eq(vxlan->net, dev_net(vxlan->dev)), + flags); +#endif + } + + return; + +drop: + dev->stats.tx_dropped++; + goto tx_free; + +rt_tx_error: + ip_rt_put(rt); +tx_error: + dev->stats.tx_errors++; +tx_free: + dev_kfree_skb(skb); +} + +/* Transmit local packets over Vxlan + * + * Outer IP header inherits ECN and DF from inner header. + * Outer UDP destination is the VXLAN assigned port. 
+ * source port is based on hash of flow + */ +netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct vxlan_dev *vxlan = netdev_priv(dev); + const struct ip_tunnel_info *info; + + info = skb_tunnel_info(skb); + + skb_reset_mac_header(skb); + + if ((vxlan->flags & VXLAN_F_PROXY)) + goto out; + + if (vxlan->flags & VXLAN_F_COLLECT_METADATA && + info && info->mode & IP_TUNNEL_INFO_TX) { + vxlan_xmit_one(skb, dev, NULL, false); + return NETDEV_TX_OK; + } +out: + pr_warn("vxlan: unsupported flag set %x", vxlan->flags); + kfree_skb(skb); + return NETDEV_TX_OK; +} +EXPORT_SYMBOL(rpl_vxlan_xmit); + +/* Walk the forwarding table and purge stale entries */ +static void vxlan_cleanup(unsigned long arg) +{ + struct vxlan_dev *vxlan = (struct vxlan_dev *) arg; + unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; + unsigned int h; + + if (!netif_running(vxlan->dev)) + return; + + for (h = 0; h < FDB_HASH_SIZE; ++h) { + struct hlist_node *p, *n; + + spin_lock_bh(&vxlan->hash_lock); + hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { + struct vxlan_fdb *f + = container_of(p, struct vxlan_fdb, hlist); + unsigned long timeout; + + if (f->state & NUD_PERMANENT) + continue; + + timeout = f->used + vxlan->cfg.age_interval * HZ; + if (time_before_eq(timeout, jiffies)) { + netdev_dbg(vxlan->dev, + "garbage collect %pM\n", + f->eth_addr); + f->state = NUD_STALE; + vxlan_fdb_destroy(vxlan, f); + } else if (time_before(timeout, next_timer)) + next_timer = timeout; + } + spin_unlock_bh(&vxlan->hash_lock); + } + + mod_timer(&vxlan->age_timer, next_timer); } -EXPORT_SYMBOL_GPL(rpl_vxlan_xmit_skb); -static void rcu_free_vs(struct rcu_head *rcu) +static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) +{ + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + __u32 vni = vxlan->default_dst.remote_vni; + + vxlan->vn_sock = vs; + spin_lock(&vn->sock_lock); + hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); + 
spin_unlock(&vn->sock_lock); +} + +/* Setup stats when device is created */ +#ifdef HAVE_DEV_TSTATS +static int vxlan_init(struct net_device *dev) +{ + dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} +#endif + +static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) +{ + struct vxlan_fdb *f; + + spin_lock_bh(&vxlan->hash_lock); + f = __vxlan_find_mac(vxlan, all_zeros_mac); + if (f) + vxlan_fdb_destroy(vxlan, f); + spin_unlock_bh(&vxlan->hash_lock); +} + +#ifdef HAVE_DEV_TSTATS +static void vxlan_uninit(struct net_device *dev) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + + vxlan_fdb_delete_default(vxlan); + + free_percpu(dev->tstats); +} +#endif + +/* Start ageing timer and join group when device is brought up */ +static int vxlan_open(struct net_device *dev) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_sock *vs; + int ret = 0; + + vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, + vxlan->cfg.no_share, vxlan->flags); + if (IS_ERR(vs)) + return PTR_ERR(vs); + + vxlan_vs_add_dev(vs, vxlan); + + if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { + ret = vxlan_igmp_join(vxlan); + if (ret == -EADDRINUSE) + ret = 0; + if (ret) { + vxlan_sock_release(vs); + return ret; + } + } + + if (vxlan->cfg.age_interval) + mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); + + return ret; +} + +/* Purge the forwarding table */ +static void vxlan_flush(struct vxlan_dev *vxlan) +{ + unsigned int h; + + spin_lock_bh(&vxlan->hash_lock); + for (h = 0; h < FDB_HASH_SIZE; ++h) { + struct hlist_node *p, *n; + + hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { + struct vxlan_fdb *f + = container_of(p, struct vxlan_fdb, hlist); + /* the all_zeros_mac entry is deleted at vxlan_uninit */ + if (!is_zero_ether_addr(f->eth_addr)) + vxlan_fdb_destroy(vxlan, f); + } + } + spin_unlock_bh(&vxlan->hash_lock); +} + +/* Cleanup timer and forwarding table on shutdown 
*/ +static int vxlan_stop(struct net_device *dev) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_sock *vs = vxlan->vn_sock; + int ret = 0; + + if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && + !vxlan_group_used(vn, vxlan)) + ret = vxlan_igmp_leave(vxlan); + + del_timer_sync(&vxlan->age_timer); + + vxlan_flush(vxlan); + vxlan_sock_release(vs); + + return ret; +} + +/* Stub, nothing needs to be done. */ +static void vxlan_set_multicast_list(struct net_device *dev) +{ +} + +static int vxlan_change_mtu(struct net_device *dev, int new_mtu) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_rdst *dst = &vxlan->default_dst; + struct net_device *lowerdev; + int max_mtu; + + lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); + if (lowerdev == NULL) + return eth_change_mtu(dev, new_mtu); + + if (dst->remote_ip.sa.sa_family == AF_INET6) + max_mtu = lowerdev->mtu - VXLAN6_HEADROOM; + else + max_mtu = lowerdev->mtu - VXLAN_HEADROOM; + + if (new_mtu < 68 || new_mtu > max_mtu) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + +static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev) +{ + /* Drop All packets coming from networking stack. OVS-CB is + * not initialized for these packets. 
+ */ + + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + +static const struct net_device_ops vxlan_netdev_ops = { +#ifdef HAVE_DEV_TSTATS + .ndo_init = vxlan_init, + .ndo_uninit = vxlan_uninit, + .ndo_get_stats64 = ip_tunnel_get_stats64, +#endif + .ndo_open = vxlan_open, + .ndo_stop = vxlan_stop, + .ndo_start_xmit = vxlan_dev_xmit, + .ndo_set_rx_mode = vxlan_set_multicast_list, + .ndo_change_mtu = vxlan_change_mtu, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_mac_address = eth_mac_addr, +}; + +/* Info for udev, that this is a virtual tunnel endpoint */ +static struct device_type vxlan_type = { + .name = "vxlan", +}; + +/* Initialize the device structure. */ +static void vxlan_setup(struct net_device *dev) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + unsigned int h; + + eth_hw_addr_random(dev); + ether_setup(dev); + + dev->netdev_ops = &vxlan_netdev_ops; + dev->destructor = free_netdev; + SET_NETDEV_DEVTYPE(dev, &vxlan_type); + + dev->features |= NETIF_F_LLTX; + dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; + dev->features |= NETIF_F_RXCSUM; + dev->features |= NETIF_F_GSO_SOFTWARE; + + dev->vlan_features = dev->features; + dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; +#endif + +#if 0 + netif_keep_dst(dev); +#endif + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + + INIT_LIST_HEAD(&vxlan->next); + spin_lock_init(&vxlan->hash_lock); + + init_timer_deferrable(&vxlan->age_timer); + vxlan->age_timer.function = vxlan_cleanup; + vxlan->age_timer.data = (unsigned long) vxlan; + + vxlan->cfg.dst_port = htons(vxlan_port); + + vxlan->dev = dev; + + for (h = 0; h < FDB_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&vxlan->fdb_head[h]); +} + +static const struct nla_policy 
vxlan_policy[IFLA_VXLAN_MAX + 1] = { + [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, +}; + +static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { + pr_debug("invalid link address (not ethernet)\n"); + return -EINVAL; + } + + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { + pr_debug("invalid all zero ethernet address\n"); + return -EADDRNOTAVAIL; + } + } + + if (!data) + return -EINVAL; + + if (data[IFLA_VXLAN_ID]) { + __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); + if (id >= VXLAN_VID_MASK) + return -ERANGE; + } + + if (data[IFLA_VXLAN_PORT_RANGE]) { + const struct ifla_vxlan_port_range *p + = nla_data(data[IFLA_VXLAN_PORT_RANGE]); + + if (ntohs(p->high) < ntohs(p->low)) { + pr_debug("port range %u .. %u not valid\n", + ntohs(p->low), ntohs(p->high)); + return -EINVAL; + } + } + + return 0; +} + +static void vxlan_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); + strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); +} + +static const struct ethtool_ops vxlan_ethtool_ops = { + .get_drvinfo = vxlan_get_drvinfo, + .get_link = ethtool_op_get_link, +}; + +static void free_vs_rcu(struct rcu_head *rcu) { struct vxlan_sock *vs = container_of(rcu, struct vxlan_sock, rcu); @@ -236,9 +1773,9 @@ static void rcu_free_vs(struct rcu_head *rcu) static void vxlan_del_work(struct work_struct *work) { struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work); - udp_tunnel_sock_release(vs->sock); - call_rcu(&vs->rcu, rcu_free_vs); + + call_rcu(&vs->rcu, free_vs_rcu); } static struct socket *vxlan_create_sock(struct net *net, bool ipv6, @@ -252,13 +1789,11 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, if (ipv6) { udp_conf.family = AF_INET6; - /* The checksum flag is silently ignored but it - * doesn't make sense here anyways because OVS enables - * 
checksums on a finer granularity than per-socket. - */ + udp_conf.use_udp6_rx_checksums = + !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); + udp_conf.ipv6_v6only = 1; } else { udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); } udp_conf.local_udp_port = port; @@ -271,32 +1806,51 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, return sock; } +/* Create new listen socket if needed */ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, u32 flags) + u32 flags) { + struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; struct socket *sock; + unsigned int h; + bool ipv6 = !!(flags & VXLAN_F_IPV6); struct udp_tunnel_sock_cfg tunnel_cfg; - vs = kmalloc(sizeof(*vs), GFP_KERNEL); - if (!vs) { - pr_debug("memory alocation failure\n"); + vs = kzalloc(sizeof(*vs), GFP_KERNEL); + if (!vs) return ERR_PTR(-ENOMEM); - } + + for (h = 0; h < VNI_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&vs->vni_list[h]); INIT_WORK(&vs->del_work, vxlan_del_work); - sock = vxlan_create_sock(net, false, port, flags); + sock = vxlan_create_sock(net, ipv6, port, flags); if (IS_ERR(sock)) { + pr_info("Cannot bind port %d, err=%ld\n", ntohs(port), + PTR_ERR(sock)); kfree(vs); return ERR_CAST(sock); } vs->sock = sock; - vs->rcv = rcv; - vs->data = data; + atomic_set(&vs->refcnt, 1); vs->flags = (flags & VXLAN_F_RCV_FLAGS); + /* Initialize the vxlan udp offloads structure */ +#ifdef HAVE_UDP_OFFLOAD + vs->udp_offloads.port = port; + vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive; + vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete; + vxlan_notify_add_rx_port(vs); +#endif + + spin_lock(&vn->sock_lock); + hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); + spin_unlock(&vn->sock_lock); + + /* Mark socket as an encapsulation socket. 
*/ tunnel_cfg.sk_user_data = vs; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = vxlan_udp_encap_recv; @@ -307,20 +1861,378 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, return vs; } -struct vxlan_sock *rpl_vxlan_sock_add(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, - bool no_share, u32 flags) +static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, + bool no_share, u32 flags) { - return vxlan_socket_create(net, port, rcv, data, flags); + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_sock *vs; + bool ipv6 = flags & VXLAN_F_IPV6; + + if (!no_share) { + spin_lock(&vn->sock_lock); + vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, + flags); + if (vs) { + if (!atomic_add_unless(&vs->refcnt, 1, 0)) + vs = ERR_PTR(-EBUSY); + spin_unlock(&vn->sock_lock); + return vs; + } + spin_unlock(&vn->sock_lock); + } + + return vxlan_socket_create(net, port, flags); } -EXPORT_SYMBOL_GPL(rpl_vxlan_sock_add); -void rpl_vxlan_sock_release(struct vxlan_sock *vs) +static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, + struct vxlan_config *conf) { - ASSERT_OVSL(); + struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_rdst *dst = &vxlan->default_dst; + int err; + bool use_ipv6 = false; + __be16 default_port = vxlan->cfg.dst_port; + + vxlan->net = src_net; + + dst->remote_vni = conf->vni; + + memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); + + /* Unless IPv6 is explicitly requested, assume IPv4 */ + if (!dst->remote_ip.sa.sa_family) + dst->remote_ip.sa.sa_family = AF_INET; + + if (dst->remote_ip.sa.sa_family == AF_INET6 || + vxlan->cfg.saddr.sa.sa_family == AF_INET6) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + use_ipv6 = true; + } + + if (conf->remote_ifindex) { + struct net_device *lowerdev + = __dev_get_by_index(src_net, conf->remote_ifindex); + + 
dst->remote_ifindex = conf->remote_ifindex; + + if (!lowerdev) { + pr_info("ifindex %d does not exist\n", dst->remote_ifindex); + return -ENODEV; + } + +#if IS_ENABLED(CONFIG_IPV6) + if (use_ipv6) { + struct inet6_dev *idev = __in6_dev_get(lowerdev); + if (idev && idev->cnf.disable_ipv6) { + pr_info("IPv6 is disabled via sysctl\n"); + return -EPERM; + } + vxlan->flags |= VXLAN_F_IPV6; + } +#endif - queue_work(system_wq, &vs->del_work); + if (!conf->mtu) + dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); + + dev->needed_headroom = lowerdev->hard_header_len + + (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); + } else if (use_ipv6) { + vxlan->flags |= VXLAN_F_IPV6; + dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM; + } else { + dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM; + } + + memcpy(&vxlan->cfg, conf, sizeof(*conf)); + if (!vxlan->cfg.dst_port) + vxlan->cfg.dst_port = default_port; + vxlan->flags |= conf->flags; + + if (!vxlan->cfg.age_interval) + vxlan->cfg.age_interval = FDB_AGE_DEFAULT; + + if (vxlan_find_vni(src_net, conf->vni, use_ipv6 ? 
AF_INET6 : AF_INET, + vxlan->cfg.dst_port, vxlan->flags)) + return -EEXIST; + + dev->ethtool_ops = &vxlan_ethtool_ops; + + /* create an fdb entry for a valid default destination */ + if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { + err = vxlan_fdb_create(vxlan, all_zeros_mac, + &vxlan->default_dst.remote_ip, + NUD_REACHABLE|NUD_PERMANENT, + NLM_F_EXCL|NLM_F_CREATE, + vxlan->cfg.dst_port, + vxlan->default_dst.remote_vni, + vxlan->default_dst.remote_ifindex, + NTF_SELF); + if (err) + return err; + } + + err = register_netdevice(dev); + if (err) { + vxlan_fdb_delete_default(vxlan); + return err; + } + + list_add(&vxlan->next, &vn->vxlan_list); + + return 0; } -EXPORT_SYMBOL_GPL(rpl_vxlan_sock_release); -#endif /* !USE_UPSTREAM_VXLAN */ +struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name, + u8 name_assign_type, struct vxlan_config *conf) +{ + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev; + int err; + + memset(&tb, 0, sizeof(tb)); + + dev = rtnl_create_link(net, (char *)name, name_assign_type, + &vxlan_link_ops, tb); + if (IS_ERR(dev)) + return dev; + + err = vxlan_dev_configure(net, dev, conf); + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } + + return dev; +} +EXPORT_SYMBOL_GPL(rpl_vxlan_dev_create); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) +static int vxlan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +#else +static int vxlan_newlink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +#endif +{ + return -EINVAL; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) +static void vxlan_dellink(struct net_device *dev, struct list_head *head) +#else +static void vxlan_dellink(struct net_device *dev) +#endif +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + + spin_lock(&vn->sock_lock); + if (!hlist_unhashed(&vxlan->hlist)) + hlist_del_rcu(&vxlan->hlist); + 
spin_unlock(&vn->sock_lock); + + list_del(&vxlan->next); + unregister_netdevice_queue(dev, head); +} + +static size_t vxlan_get_size(const struct net_device *dev) +{ + + return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ + nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ + nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ + nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ + nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ + nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ + nla_total_size(sizeof(struct ifla_vxlan_port_range)) + + nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ + 0; +} + +static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + const struct vxlan_dev *vxlan = netdev_priv(dev); + + if (nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +#ifdef HAVE_GET_LINK_NET +static struct net *vxlan_get_link_net(const struct net_device *dev) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + + return vxlan->net; +} +#endif + +static struct rtnl_link_ops vxlan_link_ops __read_mostly = { + .kind = "ovs_vxlan", + .maxtype = 
IFLA_VXLAN_MAX, + .policy = vxlan_policy, + .priv_size = sizeof(struct vxlan_dev), + .setup = vxlan_setup, + .validate = vxlan_validate, + .newlink = vxlan_newlink, + .dellink = vxlan_dellink, + .get_size = vxlan_get_size, + .fill_info = vxlan_fill_info, +#ifdef HAVE_GET_LINK_NET + .get_link_net = vxlan_get_link_net, +#endif +}; + +static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, + struct net_device *dev) +{ + struct vxlan_dev *vxlan, *next; + LIST_HEAD(list_kill); + + list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { + struct vxlan_rdst *dst = &vxlan->default_dst; + + /* In case we created vxlan device with carrier + * and we lose the carrier due to module unload + * we also need to remove vxlan device. In other + * cases, it's not necessary and remote_ifindex + * is 0 here, so no matches. + */ + if (dst->remote_ifindex == dev->ifindex) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + vxlan_dellink(vxlan->dev, &list_kill); +#else + vxlan_dellink(vxlan->dev); +#endif + } + + unregister_netdevice_many(&list_kill); +} + +static int vxlan_lowerdev_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); + + if (event == NETDEV_UNREGISTER) + vxlan_handle_lowerdev_unregister(vn, dev); + + return NOTIFY_DONE; +} + +static struct notifier_block vxlan_notifier_block __read_mostly = { + .notifier_call = vxlan_lowerdev_event, +}; + +static __net_init int vxlan_init_net(struct net *net) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + unsigned int h; + + INIT_LIST_HEAD(&vn->vxlan_list); + spin_lock_init(&vn->sock_lock); + + for (h = 0; h < PORT_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&vn->sock_list[h]); + + return 0; +} + +static void __net_exit vxlan_exit_net(struct net *net) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_dev *vxlan, *next; + struct net_device 
*dev, *aux; + LIST_HEAD(list); + + rtnl_lock(); + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &vxlan_link_ops) + unregister_netdevice_queue(dev, &list); + + list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { + /* If vxlan->dev is in the same netns, it has already been added + * to the list by the previous loop. + */ + if (!net_eq(dev_net(vxlan->dev), net)) + unregister_netdevice_queue(vxlan->dev, &list); + } + + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations vxlan_net_ops = { + .init = vxlan_init_net, + .exit = vxlan_exit_net, + .id = &vxlan_net_id, + .size = sizeof(struct vxlan_net), +}; + +DEFINE_COMPAT_PNET_REG_FUNC(device) +int rpl_vxlan_init_module(void) +{ + int rc; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) + vxlan_wq = create_workqueue("vxlan"); +#else + vxlan_wq = alloc_workqueue("vxlan", 0, 0); +#endif + if (!vxlan_wq) + return -ENOMEM; + + get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); + + rc = register_pernet_subsys(&vxlan_net_ops); + if (rc) + goto out1; + + rc = register_netdevice_notifier(&vxlan_notifier_block); + if (rc) + goto out2; + + rc = rtnl_link_register(&vxlan_link_ops); + if (rc) + goto out3; + + pr_info("VxLAN tunneling driver\n"); + return 0; +out3: + unregister_netdevice_notifier(&vxlan_notifier_block); +out2: + unregister_pernet_subsys(&vxlan_net_ops); +out1: + destroy_workqueue(vxlan_wq); + return rc; +} + +void rpl_vxlan_cleanup_module(void) +{ + rtnl_link_unregister(&vxlan_link_ops); + unregister_netdevice_notifier(&vxlan_notifier_block); + destroy_workqueue(vxlan_wq); + unregister_pernet_subsys(&vxlan_net_ops); + /* rcu_barrier() is called by netns */ +} +#endif diff --git a/datapath/vport-geneve.c b/datapath/vport-geneve.c index 4ab224dac..3b5c1ab32 100644 --- a/datapath/vport-geneve.c +++ b/datapath/vport-geneve.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Nicira, Inc. + * Copyright (c) 2015 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -26,96 +26,42 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" static struct vport_ops ovs_geneve_vport_ops; - /** * struct geneve_port - Keeps track of open UDP ports - * @gs: The socket created for this port number. - * @name: vport name. + * @port_no: destination port. */ struct geneve_port { - struct geneve_sock *gs; - char name[IFNAMSIZ]; + u16 port_no; }; -static LIST_HEAD(geneve_ports); - static inline struct geneve_port *geneve_vport(const struct vport *vport) { return vport_priv(vport); } -/* Convert 64 bit tunnel ID to 24 bit VNI. */ -static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) -{ -#ifdef __BIG_ENDIAN - vni[0] = (__force __u8)(tun_id >> 16); - vni[1] = (__force __u8)(tun_id >> 8); - vni[2] = (__force __u8)tun_id; -#else - vni[0] = (__force __u8)((__force u64)tun_id >> 40); - vni[1] = (__force __u8)((__force u64)tun_id >> 48); - vni[2] = (__force __u8)((__force u64)tun_id >> 56); -#endif -} - -/* Convert 24 bit VNI to 64 bit tunnel ID. */ -static __be64 vni_to_tunnel_id(const __u8 *vni) -{ -#ifdef __BIG_ENDIAN - return (vni[0] << 16) | (vni[1] << 8) | vni[2]; -#else - return (__force __be64)(((__force u64)vni[0] << 40) | - ((__force u64)vni[1] << 48) | - ((__force u64)vni[2] << 56)); -#endif -} - -static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) -{ - struct vport *vport = gs->rcv_data; - struct genevehdr *geneveh = geneve_hdr(skb); - int opts_len; - struct ovs_tunnel_info tun_info; - __be64 key; - __be16 flags; - - opts_len = geneveh->opt_len * 4; - - flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | - (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | - (geneveh->oam ? TUNNEL_OAM : 0) | - (geneveh->critical ? 
TUNNEL_CRIT_OPT : 0); - - key = vni_to_tunnel_id(geneveh->vni); - - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, - geneveh->options, opts_len); - - ovs_vport_receive(vport, skb, &tun_info); -} - static int geneve_get_options(const struct vport *vport, struct sk_buff *skb) { struct geneve_port *geneve_port = geneve_vport(vport); - __be16 dst_port = inet_sport(geneve_port->gs->sock->sk); - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no)) return -EMSGSIZE; return 0; } -static void geneve_tnl_destroy(struct vport *vport) +static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct dp_upcall_info *upcall) { struct geneve_port *geneve_port = geneve_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = htons(geneve_port->port_no); + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - geneve_sock_release(geneve_port->gs); - - ovs_vport_deferred_free(vport); + return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), + skb, IPPROTO_UDP, sport, dport); } static struct vport *geneve_tnl_create(const struct vport_parms *parms) @@ -123,11 +69,11 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; struct geneve_port *geneve_port; - struct geneve_sock *gs; + struct net_device *dev; struct vport *vport; struct nlattr *a; - int err; u16 dst_port; + int err; if (!options) { err = -EINVAL; @@ -149,111 +95,42 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return vport; geneve_port = geneve_vport(vport); - strncpy(geneve_port->name, parms->name, IFNAMSIZ); + geneve_port->port_no = dst_port; - gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); - if (IS_ERR(gs)) { + rtnl_lock(); + dev = 
geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); + if (IS_ERR(dev)) { + rtnl_unlock(); ovs_vport_free(vport); - return (void *)gs; + return ERR_CAST(dev); } - geneve_port->gs = gs; + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; error: return ERR_PTR(err); } -static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - const struct ovs_key_ipv4_tunnel *tun_key; - struct ovs_tunnel_info *tun_info; - struct net *net = ovs_dp_get_net(vport->dp); - struct geneve_port *geneve_port = geneve_vport(vport); - __be16 dport = inet_sport(geneve_port->gs->sock->sk); - __be16 sport; - __be32 saddr; - struct rtable *rt; - u8 vni[3], opts_len, *opts; - __be16 df; - int err; - - tun_info = OVS_CB(skb)->egress_tun_info; - if (unlikely(!tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &tun_info->tunnel; - - saddr = tun_key->ipv4_src; - rt = find_route(ovs_dp_get_net(vport->dp), - &saddr, tun_key->ipv4_dst, - IPPROTO_UDP, tun_key->ipv4_tos, - skb->mark); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; - sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - tunnel_id_to_vni(tun_key->tun_id, vni); - skb->ignore_df = 1; - - if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) { - opts = (u8 *)tun_info->options; - opts_len = tun_info->options_len; - } else { - opts = NULL; - opts_len = 0; - } - - err = geneve_xmit_skb(geneve_port->gs, rt, skb, saddr, - tun_key->ipv4_dst, tun_key->ipv4_tos, - tun_key->ipv4_ttl, df, sport, dport, - tun_key->tun_flags, vni, opts_len, opts, - !!(tun_key->tun_flags & TUNNEL_CSUM), false); - if (err < 0) - ip_rt_put(rt); - return err; - -error: - kfree_skb(skb); - return err; -} - -static const char *geneve_get_name(const struct vport *vport) +static struct vport *geneve_create(const struct vport_parms *parms) { - struct geneve_port *geneve_port = geneve_vport(vport); - - return geneve_port->name; -} + struct vport *vport; -static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - struct net *net = ovs_dp_get_net(vport->dp); - __be16 dport = inet_sport(geneve_port->gs->sock->sk); - __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + vport = geneve_tnl_create(parms); + if (IS_ERR(vport)) + return vport; - /* Get tp_src and tp_dst, refert to geneve_build_header(). 
- */ - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, sport, dport); + return ovs_netdev_link(vport, parms->name); } static struct vport_ops ovs_geneve_vport_ops = { - .type = OVS_VPORT_TYPE_GENEVE, - .create = geneve_tnl_create, - .destroy = geneve_tnl_destroy, - .get_name = geneve_get_name, - .get_options = geneve_get_options, - .send = geneve_tnl_send, + .type = OVS_VPORT_TYPE_GENEVE, + .create = geneve_create, + .destroy = ovs_netdev_tunnel_destroy, + .get_options = geneve_get_options, + .send = geneve_xmit, + .owner = THIS_MODULE, .get_egress_tun_info = geneve_get_egress_tun_info, - .owner = THIS_MODULE, }; static int __init ovs_geneve_tnl_init(void) @@ -269,6 +146,6 @@ static void __exit ovs_geneve_tnl_exit(void) module_init(ovs_geneve_tnl_init); module_exit(ovs_geneve_tnl_exit); -MODULE_DESCRIPTION("OVS: Geneve swiching port"); +MODULE_DESCRIPTION("OVS: Geneve switching port"); MODULE_LICENSE("GPL"); MODULE_ALIAS("vport-type-5"); diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c index 0328fe51e..a9ac0d48a 100644 --- a/datapath/vport-gre.c +++ b/datapath/vport-gre.c @@ -16,9 +16,6 @@ * 02110-1301, USA */ -#include <linux/kconfig.h> -#if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX) - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if.h> @@ -48,256 +45,58 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" static struct vport_ops ovs_gre_vport_ops; -/* Returns the least-significant 32 bits of a __be64. 
*/ -static __be32 be64_get_low32(__be64 x) -{ -#ifdef __BIG_ENDIAN - return (__force __be32)x; -#else - return (__force __be32)((__force u64)x >> 32); -#endif -} - -static __be16 filter_tnl_flags(__be16 flags) -{ - return flags & (TUNNEL_CSUM | TUNNEL_KEY); -} - -static struct sk_buff *__build_header(struct sk_buff *skb, - int tunnel_hlen) -{ - struct tnl_ptk_info tpi; - const struct ovs_key_ipv4_tunnel *tun_key; - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - - skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); - if (IS_ERR(skb)) - return skb; - - tpi.flags = filter_tnl_flags(tun_key->tun_flags); - tpi.proto = htons(ETH_P_TEB); - tpi.key = be64_get_low32(tun_key->tun_id); - tpi.seq = 0; - gre_build_header(skb, &tpi, tunnel_hlen); - - return skb; -} - -static __be64 key_to_tunnel_id(__be32 key, __be32 seq) +static struct vport *gre_tnl_create(const struct vport_parms *parms) { -#ifdef __BIG_ENDIAN - return (__force __be64)((__force u64)seq << 32 | (__force u32)key); -#else - return (__force __be64)((__force u64)key << 32 | (__force u32)seq); -#endif -} - -/* Called with rcu_read_lock and BH disabled. */ -static int gre_rcv(struct sk_buff *skb, - const struct tnl_ptk_info *tpi) -{ - struct ovs_tunnel_info tun_info; - struct ovs_net *ovs_net; - struct vport *vport; - __be64 key; - - ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); - vport = rcu_dereference(ovs_net->vport_net.gre_vport); - if (unlikely(!vport)) - return PACKET_REJECT; - - key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, - filter_tnl_flags(tpi->flags), NULL, 0); - - ovs_vport_receive(vport, skb, &tun_info); - return PACKET_RCVD; -} - -/* Called with rcu_read_lock and BH disabled. 
*/ -static int gre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) -{ - struct ovs_net *ovs_net; + struct net *net = ovs_dp_get_net(parms->dp); + struct net_device *dev; struct vport *vport; - ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); - vport = rcu_dereference(ovs_net->vport_net.gre_vport); - - if (unlikely(!vport)) - return PACKET_REJECT; - else - return PACKET_RCVD; -} - -static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - const struct ovs_key_ipv4_tunnel *tun_key; - struct rtable *rt; - int min_headroom; - __be16 df; - int tunnel_hlen; - __be32 saddr; - int err; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto err_free_skb; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - saddr = tun_key->ipv4_src; - rt = find_route(ovs_dp_get_net(vport->dp), - &saddr, tun_key->ipv4_dst, - IPPROTO_GRE, tun_key->ipv4_tos, - skb->mark); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto err_free_skb; - } - - tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); - - min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len - + tunnel_hlen + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) { - err = -ENOMEM; - goto err_free_rt; - } - - /* Push Tunnel header. 
*/ - skb = __build_header(skb, tunnel_hlen); - if (IS_ERR(skb)) { - err = PTR_ERR(skb); - skb = NULL; - goto err_free_rt; + vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + rtnl_lock(); + dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); } - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->ignore_df = 1; - - return iptunnel_xmit(skb->sk, rt, skb, saddr, - tun_key->ipv4_dst, IPPROTO_GRE, - tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false); -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - return err; -} - -static struct gre_cisco_protocol gre_protocol = { - .handler = gre_rcv, - .err_handler = gre_err, - .priority = 1, -}; - -static int gre_ports; -static int gre_init(void) -{ - int err; - - gre_ports++; - if (gre_ports > 1) - return 0; - - err = gre_cisco_register(&gre_protocol); - if (err) - pr_warn("cannot register gre protocol handler\n"); - - return err; -} - -static void gre_exit(void) -{ - gre_ports--; - if (gre_ports > 0) - return; - - gre_cisco_unregister(&gre_protocol); -} + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); -static const char *gre_get_name(const struct vport *vport) -{ - return vport_priv(vport); + return vport; } static struct vport *gre_create(const struct vport_parms *parms) { - struct net *net = ovs_dp_get_net(parms->dp); - struct ovs_net *ovs_net; struct vport *vport; - int err; - - err = gre_init(); - if (err) - return ERR_PTR(err); - - ovs_net = net_generic(net, ovs_net_id); - if (ovsl_dereference(ovs_net->vport_net.gre_vport)) { - vport = ERR_PTR(-EEXIST); - goto error; - } - vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); + vport = gre_tnl_create(parms); if (IS_ERR(vport)) - goto error; - - strncpy(vport_priv(vport), parms->name, IFNAMSIZ); - rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport); - return vport; - 
-error: - gre_exit(); - return vport; -} - -static void gre_tnl_destroy(struct vport *vport) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct ovs_net *ovs_net; - - ovs_net = net_generic(net, ovs_net_id); + return vport; - RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL); - ovs_vport_deferred_free(vport); - gre_exit(); + return ovs_netdev_link(vport, parms->name); } static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct dp_upcall_info *upcall) { - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_GRE, skb->mark, 0, 0); + return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), + skb, IPPROTO_GRE, 0, 0); } static struct vport_ops ovs_gre_vport_ops = { .type = OVS_VPORT_TYPE_GRE, .create = gre_create, - .destroy = gre_tnl_destroy, - .get_name = gre_get_name, - .send = gre_tnl_send, + .send = gre_fb_xmit, .get_egress_tun_info = gre_get_egress_tun_info, + .destroy = ovs_netdev_tunnel_destroy, .owner = THIS_MODULE, }; @@ -317,4 +116,3 @@ module_exit(ovs_gre_tnl_exit); MODULE_DESCRIPTION("OVS: GRE switching port"); MODULE_LICENSE("GPL"); MODULE_ALIAS("vport-type-3"); -#endif diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c index f38f9be07..7f216792b 100644 --- a/datapath/vport-internal_dev.c +++ b/datapath/vport-internal_dev.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -22,15 +22,16 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> -#include <linux/netdev_features.h> #include <linux/skbuff.h> -#include <linux/version.h> +#include <linux/percpu.h> +#include <linux/u64_stats_sync.h> +#include <linux/netdev_features.h> #include <net/dst.h> #include <net/xfrm.h> +#include <net/rtnetlink.h> #include "datapath.h" -#include "vlan.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -45,42 +46,30 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev) return netdev_priv(netdev); } -/* This function is only called by the kernel network layer.*/ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) -static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, - struct rtnl_link_stats64 *stats) -{ -#else -static struct net_device_stats *internal_dev_sys_stats(struct net_device *netdev) -{ - struct net_device_stats *stats = &netdev->stats; -#endif - struct vport *vport = ovs_internal_dev_get_vport(netdev); - struct ovs_vport_stats vport_stats; - - ovs_vport_get_stats(vport, &vport_stats); - - /* The tx and rx stats need to be swapped because the - * switch and host OS have opposite perspectives. - */ - stats->rx_packets = vport_stats.tx_packets; - stats->tx_packets = vport_stats.rx_packets; - stats->rx_bytes = vport_stats.tx_bytes; - stats->tx_bytes = vport_stats.rx_bytes; - stats->rx_errors = vport_stats.tx_errors; - stats->tx_errors = vport_stats.rx_errors; - stats->rx_dropped = vport_stats.tx_dropped; - stats->tx_dropped = vport_stats.rx_dropped; - - return stats; -} - /* Called with rcu_read_lock_bh. 
*/ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { + int len, err; + + len = skb->len; rcu_read_lock(); - ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); + err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); + + if (likely(!err)) { +#ifdef HAVE_DEV_TSTATS + struct pcpu_sw_netstats *tstats; + + tstats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)netdev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); +#endif + } else { + netdev->stats.tx_errors++; + } return 0; } @@ -132,17 +121,32 @@ static void internal_dev_destructor(struct net_device *dev) free_netdev(dev); } +#ifdef HAVE_DEV_TSTATS +static int internal_dev_init(struct net_device *dev) +{ + dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + return 0; +} + +static void internal_dev_uninit(struct net_device *dev) +{ + free_percpu(dev->tstats); +} +#endif + static const struct net_device_ops internal_dev_netdev_ops = { +#ifdef HAVE_DEV_TSTATS + .ndo_init = internal_dev_init, + .ndo_uninit = internal_dev_uninit, + .ndo_get_stats64 = ip_tunnel_get_stats64, +#endif .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) - .ndo_get_stats64 = internal_dev_get_stats, -#else - .ndo_get_stats = internal_dev_sys_stats, -#endif }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -156,7 +160,7 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; netdev->destructor = 
internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -169,62 +173,56 @@ static void do_setup(struct net_device *netdev) netdev->vlan_features = netdev->features; netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) - netdev->hw_features = netdev->features & ~NETIF_F_LLTX; -#endif - #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) netdev->hw_enc_features = netdev->features; #endif - +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) + netdev->hw_features = netdev->features & ~NETIF_F_LLTX; +#endif eth_hw_addr_random(netdev); } static struct vport *internal_dev_create(const struct vport_parms *parms) { struct vport *vport; - struct netdev_vport *netdev_vport; struct internal_dev *internal_dev; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_internal_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); goto error; } - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_UNKNOWN, do_setup); - if (!netdev_vport->dev) { + vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, NET_NAME_UNKNOWN, do_setup); + if (!vport->dev) { err = -ENOMEM; goto error_free_vport; } - dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); - internal_dev = internal_dev_priv(netdev_vport->dev); + dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = vport; /* Restrict bridge port to current netns. 
*/ if (vport->port_no == OVSP_LOCAL) - netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + vport->dev->features |= NETIF_F_NETNS_LOCAL; rtnl_lock(); - err = register_netdevice(netdev_vport->dev); + err = register_netdevice(vport->dev); if (err) goto error_free_netdev; - dev_set_promiscuity(netdev_vport->dev, 1); + dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); - netif_start_queue(netdev_vport->dev); + netif_start_queue(vport->dev); return vport; error_free_netdev: rtnl_unlock(); - free_netdev(netdev_vport->dev); + free_netdev(vport->dev); error_free_vport: ovs_vport_free(vport); error: @@ -233,26 +231,27 @@ error: static void internal_dev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - - netif_stop_queue(netdev_vport->dev); + netif_stop_queue(vport->dev); rtnl_lock(); - dev_set_promiscuity(netdev_vport->dev, -1); + dev_set_promiscuity(vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. */ - unregister_netdevice(netdev_vport->dev); + unregister_netdevice(vport->dev); rtnl_unlock(); } -static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +static netdev_tx_t internal_dev_recv(struct sk_buff *skb) { - struct net_device *netdev = netdev_vport_priv(vport)->dev; - int len; + struct net_device *netdev = skb->dev; +#ifdef HAVE_DEV_TSTATS + struct pcpu_sw_netstats *stats; +#endif if (unlikely(!(netdev->flags & IFF_UP))) { kfree_skb(skb); - return 0; + netdev->stats.rx_dropped++; + return NETDEV_TX_OK; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) @@ -260,7 +259,7 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) if (unlikely(!vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)))) - return 0; + return NETDEV_TX_OK; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(skb->csum, @@ -271,27 +270,30 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) } #endif - len = skb->len; - skb_dst_drop(skb); 
nf_reset(skb); secpath_reset(skb); - skb->dev = netdev; skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, netdev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - netif_rx(skb); +#ifdef HAVE_DEV_TSTATS + stats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)netdev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); +#endif - return len; + netif_rx(skb); + return NETDEV_TX_OK; } static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, - .get_name = ovs_netdev_get_name, .send = internal_dev_recv, }; diff --git a/datapath/vport-lisp.c b/datapath/vport-lisp.c index 104a21d66..e6c00facd 100644 --- a/datapath/vport-lisp.c +++ b/datapath/vport-lisp.c @@ -1,332 +1,67 @@ /* - * Copyright (c) 2011 Nicira, Inc. - * Copyright (c) 2013 Cisco Systems, Inc. + * Copyright (c) 2015 Nicira, Inc. * * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/version.h> - #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> -#include <linux/module.h> #include <linux/rculist.h> #include <linux/udp.h> +#include <linux/if_vlan.h> +#include <linux/module.h> +#include <net/lisp.h> #include <net/icmp.h> #include <net/ip.h> #include <net/route.h> #include <net/udp.h> -#include <net/udp_tunnel.h> #include <net/xfrm.h> #include "datapath.h" -#include "gso.h" #include "vport.h" +#include "vport-netdev.h" -/* - * LISP encapsulation header: - * - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |N|L|E|V|I|flags| Nonce/Map-Version | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Instance ID/Locator Status Bits | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - */ - -/** - * struct lisphdr - LISP header - * @nonce_present: Flag indicating the presence of a 24 bit nonce value. - * @locator_status_bits_present: Flag indicating the presence of Locator Status - * Bits (LSB). - * @solicit_echo_nonce: Flag indicating the use of the echo noncing mechanism. - * @map_version_present: Flag indicating the use of mapping versioning. - * @instance_id_present: Flag indicating the presence of a 24 bit Instance ID. - * @reserved_flags: 3 bits reserved for future flags. - * @nonce: 24 bit nonce value. - * @map_version: 24 bit mapping version. - * @locator_status_bits: Locator Status Bits: 32 bits when instance_id_present - * is not set, 8 bits when it is. 
- * @instance_id: 24 bit Instance ID - */ -struct lisphdr { -#ifdef __LITTLE_ENDIAN_BITFIELD - __u8 reserved_flags:3; - __u8 instance_id_present:1; - __u8 map_version_present:1; - __u8 solicit_echo_nonce:1; - __u8 locator_status_bits_present:1; - __u8 nonce_present:1; -#else - __u8 nonce_present:1; - __u8 locator_status_bits_present:1; - __u8 solicit_echo_nonce:1; - __u8 map_version_present:1; - __u8 instance_id_present:1; - __u8 reserved_flags:3; -#endif - union { - __u8 nonce[3]; - __u8 map_version[3]; - } u1; - union { - __be32 locator_status_bits; - struct { - __u8 instance_id[3]; - __u8 locator_status_bits; - } word2; - } u2; -}; - -#define LISP_HLEN (sizeof(struct udphdr) + sizeof(struct lisphdr)) - +static struct vport_ops ovs_lisp_vport_ops; /** * struct lisp_port - Keeps track of open UDP ports - * @dst_port: lisp UDP port no. - * @list: list element in @lisp_ports. - * @lisp_rcv_socket: The socket created for this port number. - * @name: vport name. + * @dst_port: destination port. */ struct lisp_port { - __be16 dst_port; - struct list_head list; - struct socket *lisp_rcv_socket; - char name[IFNAMSIZ]; + u16 port_no; }; -static LIST_HEAD(lisp_ports); -static struct vport_ops ovs_lisp_vport_ops; - static inline struct lisp_port *lisp_vport(const struct vport *vport) { return vport_priv(vport); } -static struct lisp_port *lisp_find_port(struct net *net, __be16 port) -{ - struct lisp_port *lisp_port; - - list_for_each_entry_rcu(lisp_port, &lisp_ports, list) { - if (lisp_port->dst_port == port && - net_eq(sock_net(lisp_port->lisp_rcv_socket->sk), net)) - return lisp_port; - } - - return NULL; -} - -static inline struct lisphdr *lisp_hdr(const struct sk_buff *skb) -{ - return (struct lisphdr *)(udp_hdr(skb) + 1); -} - -/* Convert 64 bit tunnel ID to 24 bit Instance ID. 
*/ -static void tunnel_id_to_instance_id(__be64 tun_id, __u8 *iid) -{ - -#ifdef __BIG_ENDIAN - iid[0] = (__force __u8)(tun_id >> 16); - iid[1] = (__force __u8)(tun_id >> 8); - iid[2] = (__force __u8)tun_id; -#else - iid[0] = (__force __u8)((__force u64)tun_id >> 40); - iid[1] = (__force __u8)((__force u64)tun_id >> 48); - iid[2] = (__force __u8)((__force u64)tun_id >> 56); -#endif -} - -/* Convert 24 bit Instance ID to 64 bit tunnel ID. */ -static __be64 instance_id_to_tunnel_id(__u8 *iid) -{ -#ifdef __BIG_ENDIAN - return (iid[0] << 16) | (iid[1] << 8) | iid[2]; -#else - return (__force __be64)(((__force u64)iid[0] << 40) | - ((__force u64)iid[1] << 48) | - ((__force u64)iid[2] << 56)); -#endif -} - -/* Compute source UDP port for outgoing packet. - * Currently we use the flow hash. - */ -static u16 get_src_port(struct net *net, struct sk_buff *skb) -{ - u32 hash = skb_get_hash(skb); - unsigned int range; - int high; - int low; - - if (!hash) { - if (skb->protocol == htons(ETH_P_IP)) { - struct iphdr *iph; - int size = (sizeof(iph->saddr) * 2) / sizeof(u32); - - iph = (struct iphdr *) skb_network_header(skb); - hash = jhash2((const u32 *)&iph->saddr, size, 0); - } else if (skb->protocol == htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6hdr; - - ipv6hdr = (struct ipv6hdr *) skb_network_header(skb); - hash = jhash2((const u32 *)&ipv6hdr->saddr, - (sizeof(struct in6_addr) * 2) / sizeof(u32), 0); - } else { - pr_warn_once("LISP inner protocol is not IP when " - "calculating hash.\n"); - } - } - - inet_get_local_port_range(net, &low, &high); - range = (high - low) + 1; - return (((u64) hash * range) >> 32) + low; -} - -static void lisp_build_header(struct sk_buff *skb) -{ - struct lisphdr *lisph; - const struct ovs_key_ipv4_tunnel *tun_key; - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - - lisph = (struct lisphdr *)__skb_push(skb, sizeof(struct lisphdr)); - lisph->nonce_present = 0; /* We don't support echo nonce algorithm */ - lisph->locator_status_bits_present = 1; 
/* Set LSB */ - lisph->solicit_echo_nonce = 0; /* No echo noncing */ - lisph->map_version_present = 0; /* No mapping versioning, nonce instead */ - lisph->instance_id_present = 1; /* Store the tun_id as Instance ID */ - lisph->reserved_flags = 0; /* Reserved flags, set to 0 */ - - lisph->u1.nonce[0] = 0; - lisph->u1.nonce[1] = 0; - lisph->u1.nonce[2] = 0; - - tunnel_id_to_instance_id(tun_key->tun_id, &lisph->u2.word2.instance_id[0]); - lisph->u2.word2.locator_status_bits = 1; -} - -/* Called with rcu_read_lock and BH disabled. */ -static int lisp_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct lisp_port *lisp_port; - struct lisphdr *lisph; - struct iphdr *iph, *inner_iph; - struct ovs_tunnel_info tun_info; - __be64 key; - struct ethhdr *ethh; - __be16 protocol; - - lisp_port = rcu_dereference_sk_user_data(sk); - if (unlikely(!lisp_port)) - goto error; - - if (iptunnel_pull_header(skb, LISP_HLEN, 0)) - goto error; - - lisph = lisp_hdr(skb); - - if (lisph->instance_id_present != 1) - key = 0; - else - key = instance_id_to_tunnel_id(&lisph->u2.word2.instance_id[0]); - - /* Save outer tunnel values */ - iph = ip_hdr(skb); - ovs_flow_tun_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, TUNNEL_KEY, NULL, 0); - - /* Drop non-IP inner packets */ - inner_iph = (struct iphdr *)(lisph + 1); - switch (inner_iph->version) { - case 4: - protocol = htons(ETH_P_IP); - break; - case 6: - protocol = htons(ETH_P_IPV6); - break; - default: - goto error; - } - skb->protocol = protocol; - - /* Add Ethernet header */ - ethh = (struct ethhdr *)skb_push(skb, ETH_HLEN); - memset(ethh, 0, ETH_HLEN); - ethh->h_dest[0] = 0x02; - ethh->h_source[0] = 0x02; - ethh->h_proto = protocol; - - ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - - ovs_vport_receive(vport_from_priv(lisp_port), skb, &tun_info); - goto out; - -error: - kfree_skb(skb); -out: - return 0; -} - -static int lisp_socket_init(struct lisp_port *lisp_port, struct net *net) -{ - struct udp_port_cfg 
udp_conf; - struct udp_tunnel_sock_cfg tunnel_cfg; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - udp_conf.local_udp_port = lisp_port->dst_port; - - err = udp_sock_create(net, &udp_conf, &lisp_port->lisp_rcv_socket); - if (err < 0) { - pr_warn("cannot register lisp protocol handler: %d\n", err); - return err; - } - - tunnel_cfg.sk_user_data = lisp_port; - tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = lisp_rcv; - tunnel_cfg.encap_destroy = NULL; - - setup_udp_tunnel_sock(net, lisp_port->lisp_rcv_socket, &tunnel_cfg); - - return 0; -} - -static int lisp_get_options(const struct vport *vport, struct sk_buff *skb) +static int lisp_get_options(const struct vport *vport, + struct sk_buff *skb) { struct lisp_port *lisp_port = lisp_vport(vport); - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(lisp_port->dst_port))) + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, lisp_port->port_no)) return -EMSGSIZE; return 0; } -static void lisp_tnl_destroy(struct vport *vport) +static int lisp_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct dp_upcall_info *upcall) { struct lisp_port *lisp_port = lisp_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = htons(lisp_port->port_no); + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - list_del_rcu(&lisp_port->list); - udp_tunnel_sock_release(lisp_port->lisp_rcv_socket); - ovs_vport_deferred_free(vport); + return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), + skb, IPPROTO_UDP, sport, dport); } static struct vport *lisp_tnl_create(const struct vport_parms *parms) @@ -334,10 +69,11 @@ static struct vport *lisp_tnl_create(const struct vport_parms *parms) struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; struct lisp_port *lisp_port; + struct net_device *dev; struct vport *vport; struct nlattr *a; - int err; u16 dst_port; + int err; 
if (!options) { err = -EINVAL; @@ -353,158 +89,48 @@ static struct vport *lisp_tnl_create(const struct vport_parms *parms) goto error; } - /* Verify if we already have a socket created for this port */ - if (lisp_find_port(net, htons(dst_port))) { - err = -EEXIST; - goto error; - } - vport = ovs_vport_alloc(sizeof(struct lisp_port), &ovs_lisp_vport_ops, parms); if (IS_ERR(vport)) return vport; lisp_port = lisp_vport(vport); - lisp_port->dst_port = htons(dst_port); - strncpy(lisp_port->name, parms->name, IFNAMSIZ); - - err = lisp_socket_init(lisp_port, net); - if (err) - goto error_free; + lisp_port->port_no = dst_port; + + rtnl_lock(); + dev = lisp_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } - list_add_tail_rcu(&lisp_port->list, &lisp_ports); + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; - -error_free: - ovs_vport_free(vport); error: return ERR_PTR(err); } -static int lisp_send(struct vport *vport, struct sk_buff *skb) -{ - struct ovs_key_ipv4_tunnel *tun_key; - struct lisp_port *lisp_port = lisp_vport(vport); - struct net *net = ovs_dp_get_net(vport->dp); - int network_offset = skb_network_offset(skb); - struct rtable *rt; - int min_headroom; - __be32 saddr; - __be16 src_port, dst_port; - __be16 df; - int sent_len; - int err; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - - if (skb->protocol != htons(ETH_P_IP) && - skb->protocol != htons(ETH_P_IPV6)) { - err = 0; - goto error; - } - - /* Route lookup */ - saddr = tun_key->ipv4_src; - rt = find_route(ovs_dp_get_net(vport->dp), - &saddr, tun_key->ipv4_dst, - IPPROTO_UDP, tun_key->ipv4_tos, - skb->mark); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len - + sizeof(struct iphdr) + LISP_HLEN; - - if 
(skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - - /* Reset l2 headers. */ - skb_pull(skb, network_offset); - skb_reset_mac_header(skb); - vlan_set_tci(skb, 0); - - skb = udp_tunnel_handle_offloads(skb, false, false); - if (IS_ERR(skb)) { - err = PTR_ERR(skb); - skb = NULL; - goto err_free_rt; - } - - src_port = htons(get_src_port(net, skb)); - dst_port = lisp_port->dst_port; - - lisp_build_header(skb); - - skb->ignore_df = 1; - - ovs_skb_set_inner_protocol(skb, skb->protocol); - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - sent_len = udp_tunnel_xmit_skb(rt, lisp_port->lisp_rcv_socket->sk, skb, - saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, tun_key->ipv4_ttl, - df, src_port, dst_port, false, true); - - return sent_len > 0 ? sent_len + network_offset : sent_len; - -err_free_rt: - ip_rt_put(rt); -error: - kfree_skb(skb); - return err; -} - -static const char *lisp_get_name(const struct vport *vport) +static struct vport *lisp_create(const struct vport_parms *parms) { - struct lisp_port *lisp_port = lisp_vport(vport); - return lisp_port->name; -} - -static int lisp_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct lisp_port *lisp_port = lisp_vport(vport); + struct vport *vport; - if (skb->protocol != htons(ETH_P_IP) && - skb->protocol != htons(ETH_P_IPV6)) { - return -EINVAL; - } + vport = lisp_tnl_create(parms); + if (IS_ERR(vport)) + return vport; - /* - * Get tp_src and tp_dst, refert to lisp_build_header(). 
- */ - return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, - htons(get_src_port(net, skb)), - lisp_port->dst_port); + return ovs_netdev_link(vport, parms->name); } static struct vport_ops ovs_lisp_vport_ops = { - .type = OVS_VPORT_TYPE_LISP, - .create = lisp_tnl_create, - .destroy = lisp_tnl_destroy, - .get_name = lisp_get_name, - .get_options = lisp_get_options, - .send = lisp_send, + .type = OVS_VPORT_TYPE_LISP, + .create = lisp_create, + .destroy = ovs_netdev_tunnel_destroy, + .get_options = lisp_get_options, + .send = lisp_xmit, + .owner = THIS_MODULE, .get_egress_tun_info = lisp_get_egress_tun_info, - .owner = THIS_MODULE, }; static int __init ovs_lisp_tnl_init(void) @@ -520,6 +146,6 @@ static void __exit ovs_lisp_tnl_exit(void) module_init(ovs_lisp_tnl_init); module_exit(ovs_lisp_tnl_exit); -MODULE_DESCRIPTION("OVS: LISP switching port"); +MODULE_DESCRIPTION("OVS: Lisp switching port"); MODULE_LICENSE("GPL"); MODULE_ALIAS("vport-type-105"); diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c index 6c8373740..21431d3c3 100644 --- a/datapath/vport-netdev.c +++ b/datapath/vport-netdev.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -26,32 +26,61 @@ #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/openvswitch.h> -#include <linux/netdevice.h> -#include <net/llc.h> +#include <net/ip_tunnels.h> +#include <net/rtnetlink.h> #include "datapath.h" -#include "vlan.h" +#include "gso.h" +#include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static struct vport_ops ovs_netdev_vport_ops; -static void netdev_port_receive(struct vport *vport, struct sk_buff *skb); + +/* Must be called with rcu_read_lock. 
*/ +void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info) +{ + struct vport *vport; + + vport = ovs_netdev_get_vport(skb->dev); + if (unlikely(!vport)) + goto error; + + if (unlikely(skb_warn_if_lro(skb))) + goto error; + + /* Make our own copy of the packet. Otherwise we will mangle the + * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return; + + skb_push(skb, ETH_HLEN); + ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + ovs_vport_receive(vport, skb, tun_info); + return; +error: + kfree_skb(skb); +} + +#ifndef HAVE_METADATA_DST +#define port_receive(skb) netdev_port_receive(skb, NULL) +#else +#define port_receive(skb) netdev_port_receive(skb, skb_tunnel_info(skb)) +#endif #if defined HAVE_RX_HANDLER_PSKB /* 2.6.39 and above or backports */ /* Called with rcu_read_lock and bottom-halves disabled. */ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; - struct vport *vport; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; - vport = ovs_netdev_get_vport(skb->dev); - - netdev_port_receive(vport, skb); - + port_receive(skb); return RX_HANDLER_CONSUMED; } #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) || \ @@ -59,15 +88,10 @@ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) /* Called with rcu_read_lock and bottom-halves disabled. 
*/ static struct sk_buff *netdev_frame_hook(struct sk_buff *skb) { - struct vport *vport; - if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return skb; - vport = ovs_netdev_get_vport(skb->dev); - - netdev_port_receive(vport, skb); - + port_receive(skb); return NULL; } #elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) @@ -79,7 +103,7 @@ static struct sk_buff *netdev_frame_hook(struct sk_buff *skb) static struct sk_buff *netdev_frame_hook(struct net_bridge_port *p, struct sk_buff *skb) { - netdev_port_receive((struct vport *)p, skb); + port_receive(skb); return NULL; } #else @@ -92,167 +116,112 @@ static struct net_device *get_dpdev(const struct datapath *dp) local = ovs_vport_ovsl(dp, OVSP_LOCAL); BUG_ON(!local); - return netdev_vport_priv(local)->dev; + return local->dev; } -static struct vport *netdev_create(const struct vport_parms *parms) +struct vport *ovs_netdev_link(struct vport *vport, const char *name) { - struct vport *vport; - struct netdev_vport *netdev_vport; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_netdev_vport_ops, parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto error; - } - - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); - if (!netdev_vport->dev) { + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); + if (!vport->dev) { err = -ENODEV; goto error_free_vport; } - if (netdev_vport->dev->flags & IFF_LOOPBACK || - netdev_vport->dev->type != ARPHRD_ETHER || - ovs_is_internal_dev(netdev_vport->dev)) { + if (vport->dev->flags & IFF_LOOPBACK || + vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); - err = netdev_master_upper_dev_link(netdev_vport->dev, + err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp)); if (err) goto error_unlock; - err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, + err = 
netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; - dev_disable_lro(netdev_vport->dev); - dev_set_promiscuity(netdev_vport->dev, 1); - netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + dev_disable_lro(vport->dev); + dev_set_promiscuity(vport->dev, 1); + vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: - netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); + netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: - dev_put(netdev_vport->dev); + dev_put(vport->dev); error_free_vport: ovs_vport_free(vport); -error: return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(ovs_netdev_link); + +static struct vport *netdev_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + return ovs_netdev_link(vport, parms->name); +} -static void free_port_rcu(struct rcu_head *rcu) +static void vport_netdev_free(struct rcu_head *rcu) { - struct netdev_vport *netdev_vport = container_of(rcu, - struct netdev_vport, rcu); + struct vport *vport = container_of(rcu, struct vport, rcu); - dev_put(netdev_vport->dev); - ovs_vport_free(vport_from_priv(netdev_vport)); + if (vport->dev) + dev_put(vport->dev); + ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - ASSERT_RTNL(); - netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; - netdev_rx_handler_unregister(netdev_vport->dev); - netdev_upper_dev_unlink(netdev_vport->dev, - netdev_master_upper_dev_get(netdev_vport->dev)); - dev_set_promiscuity(netdev_vport->dev, -1); + vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(vport->dev); + netdev_upper_dev_unlink(vport->dev, + netdev_master_upper_dev_get(vport->dev)); + dev_set_promiscuity(vport->dev, -1); } 
+EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev); static void netdev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - rtnl_lock(); - if (ovs_netdev_get_vport(netdev_vport->dev)) + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) ovs_netdev_detach_dev(vport); rtnl_unlock(); - call_rcu(&netdev_vport->rcu, free_port_rcu); -} - -const char *ovs_netdev_get_name(const struct vport *vport) -{ - const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - return netdev_vport->dev->name; -} - -/* Must be called with rcu_read_lock. */ -static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) -{ - if (unlikely(!vport)) - goto error; - - if (unlikely(skb_warn_if_lro(skb))) - goto error; - - /* Make our own copy of the packet. Otherwise we will mangle the - * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). - * (No one comes after us, since we tell handle_bridge() that we took - * the packet.) - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(!skb)) - return; - - skb_push(skb, ETH_HLEN); - ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - - ovs_vport_receive(vport, skb, NULL); - return; - -error: - kfree_skb(skb); + call_rcu(&vport->rcu, vport_netdev_free); } -static unsigned int packet_length(const struct sk_buff *skb) +void ovs_netdev_tunnel_destroy(struct vport *vport) { - unsigned int length = skb->len - ETH_HLEN; - - if (skb->protocol == htons(ETH_P_8021Q)) - length -= VLAN_HLEN; - - return length; -} - -static int netdev_send(struct vport *vport, struct sk_buff *skb) -{ - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - int mtu = netdev_vport->dev->mtu; - int len; - - if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { - net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - netdev_vport->dev->name, - packet_length(skb), mtu); - goto drop; - } - - skb->dev = netdev_vport->dev; - len = skb->len; - dev_queue_xmit(skb); + rtnl_lock(); + if 
(vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); - return len; + /* Early release so we can unregister the device */ + dev_put(vport->dev); + rtnl_delete_link(vport->dev); + vport->dev = NULL; + rtnl_unlock(); -drop: - kfree_skb(skb); - return 0; + call_rcu(&vport->rcu, vport_netdev_free); } +EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) @@ -285,8 +254,7 @@ static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, - .get_name = ovs_netdev_get_name, - .send = netdev_send, + .send = dev_queue_xmit, }; int __init ovs_netdev_init(void) diff --git a/datapath/vport-netdev.h b/datapath/vport-netdev.h index 6f7038e79..f8fbb8689 100644 --- a/datapath/vport-netdev.h +++ b/datapath/vport-netdev.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -26,22 +26,15 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct netdev_vport { - struct rcu_head rcu; - - struct net_device *dev; -}; - -static inline struct netdev_vport * -netdev_vport_priv(const struct vport *vport) -{ - return vport_priv(vport); -} - -const char *ovs_netdev_get_name(const struct vport *); +struct vport *ovs_netdev_link(struct vport *vport, const char *name); +void ovs_netdev_send(struct vport *vport, struct sk_buff *skb); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); void ovs_netdev_exit(void); +void ovs_netdev_tunnel_destroy(struct vport *vport); + +void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info); + #endif /* vport_netdev.h */ diff --git a/datapath/vport-stt.c b/datapath/vport-stt.c index 4eb0282fa..9e2079a16 100644 --- a/datapath/vport-stt.c +++ b/datapath/vport-stt.c @@ -9,34 +9,34 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/if_vlan.h> #include <linux/in.h> #include <linux/ip.h> -#include <linux/module.h> #include <linux/net.h> #include <linux/rculist.h> #include <linux/udp.h> +#include <linux/if_vlan.h> +#include <linux/module.h> +#include <net/stt.h> #include <net/icmp.h> #include <net/ip.h> #include <net/route.h> -#include <net/stt.h> #include <net/udp.h> +#include <net/xfrm.h> +#include <net/stt.h> #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" #ifdef OVS_STT static struct vport_ops ovs_stt_vport_ops; - /** - * struct stt_port - * @stt_sock: The socket created for this port number. - * @name: vport name. + * struct stt_port - Keeps track of open UDP ports + * @dst_port: destination port. 
*/ struct stt_port { - struct stt_sock *stt_sock; - char name[IFNAMSIZ]; + u16 port_no; }; static inline struct stt_port *stt_vport(const struct vport *vport) @@ -44,42 +44,26 @@ static inline struct stt_port *stt_vport(const struct vport *vport) return vport_priv(vport); } -static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb) -{ - struct vport *vport = stt_sock->rcv_data; - struct stthdr *stth = stt_hdr(skb); - struct ovs_tunnel_info tun_info; - struct sk_buff *next; - - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), - tcp_hdr(skb)->source, tcp_hdr(skb)->dest, - get_unaligned(&stth->key), - TUNNEL_KEY | TUNNEL_CSUM, - NULL, 0); - do { - next = skb->next; - skb->next = NULL; - ovs_vport_receive(vport, skb, &tun_info); - } while ((skb = next)); -} - -static int stt_tnl_get_options(const struct vport *vport, - struct sk_buff *skb) +static int stt_get_options(const struct vport *vport, + struct sk_buff *skb) { struct stt_port *stt_port = stt_vport(vport); - struct inet_sock *sk = inet_sk(stt_port->stt_sock->sock->sk); - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, stt_port->port_no)) return -EMSGSIZE; return 0; } -static void stt_tnl_destroy(struct vport *vport) +static int stt_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct dp_upcall_info *upcall) { struct stt_port *stt_port = stt_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = htons(stt_port->port_no); + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - stt_sock_release(stt_port->stt_sock); - ovs_vport_deferred_free(vport); + return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), + skb, IPPROTO_UDP, sport, dport); } static struct vport *stt_tnl_create(const struct vport_parms *parms) @@ -87,11 +71,11 @@ static struct vport *stt_tnl_create(const struct vport_parms *parms) struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = 
parms->options; struct stt_port *stt_port; - struct stt_sock *stt_sock; + struct net_device *dev; struct vport *vport; struct nlattr *a; - int err; u16 dst_port; + int err; if (!options) { err = -EINVAL; @@ -113,113 +97,52 @@ static struct vport *stt_tnl_create(const struct vport_parms *parms) return vport; stt_port = stt_vport(vport); - strncpy(stt_port->name, parms->name, IFNAMSIZ); + stt_port->port_no = dst_port; - stt_sock = stt_sock_add(net, htons(dst_port), stt_rcv, vport); - if (IS_ERR(stt_sock)) { + rtnl_lock(); + dev = stt_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); + if (IS_ERR(dev)) { + rtnl_unlock(); ovs_vport_free(vport); - return ERR_CAST(stt_sock); + return ERR_CAST(dev); } - stt_port->stt_sock = stt_sock; + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; error: return ERR_PTR(err); } -static int stt_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct stt_port *stt_port = stt_vport(vport); - __be16 dport = inet_sk(stt_port->stt_sock->sock->sk)->inet_sport; - const struct ovs_key_ipv4_tunnel *tun_key; - const struct ovs_tunnel_info *tun_info; - struct rtable *rt; - __be16 sport; - __be32 saddr; - __be16 df; - int err; - - tun_info = OVS_CB(skb)->egress_tun_info; - if (unlikely(!tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &tun_info->tunnel; - /* Route lookup */ - saddr = tun_key->ipv4_src; - rt = find_route(ovs_dp_get_net(vport->dp), - &saddr, tun_key->ipv4_dst, - IPPROTO_TCP, tun_key->ipv4_tos, - skb->mark); - - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; - sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - skb->ignore_df = 1; - - return stt_xmit_skb(skb, rt, saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, tun_key->ipv4_ttl, - df, sport, dport, tun_key->tun_id); -error: - kfree_skb(skb); - return err; -} - -static const char *stt_tnl_get_name(const struct vport *vport) +static struct vport *stt_create(const struct vport_parms *parms) { - return stt_vport(vport)->name; -} + struct vport *vport; -static int stt_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) -{ - struct stt_port *stt_port = stt_vport(vport); - struct net *net = ovs_dp_get_net(vport->dp); - __be16 dport = inet_sk(stt_port->stt_sock->sock->sk)->inet_sport; - __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + vport = stt_tnl_create(parms); + if (IS_ERR(vport)) + return vport; - /* Get tp_src and tp_dst, refert to stt_build_header(). - */ - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_TCP, skb->mark, sport, dport); + return ovs_netdev_link(vport, parms->name); } static struct vport_ops ovs_stt_vport_ops = { - .type = OVS_VPORT_TYPE_STT, - .create = stt_tnl_create, - .destroy = stt_tnl_destroy, - .get_name = stt_tnl_get_name, - .get_options = stt_tnl_get_options, - .send = stt_tnl_send, + .type = OVS_VPORT_TYPE_STT, + .create = stt_create, + .destroy = ovs_netdev_tunnel_destroy, + .get_options = stt_get_options, + .send = ovs_stt_xmit, + .owner = THIS_MODULE, .get_egress_tun_info = stt_get_egress_tun_info, - .owner = THIS_MODULE, }; static int __init ovs_stt_tnl_init(void) { - int err; - - err = stt_init_module(); - if (err) - return err; - err = ovs_vport_ops_register(&ovs_stt_vport_ops); - if (err) - stt_cleanup_module(); - return err; + return ovs_vport_ops_register(&ovs_stt_vport_ops); } static void __exit ovs_stt_tnl_exit(void) { ovs_vport_ops_unregister(&ovs_stt_vport_ops); - 
stt_cleanup_module(); } module_init(ovs_stt_tnl_init); diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c index fc9f350d2..66b79f4db 100644 --- a/datapath/vport-vxlan.c +++ b/datapath/vport-vxlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Nicira, Inc. + * Copyright (c) 2015 Nicira, Inc. * Copyright (c) 2013 Cisco Systems, Inc. * * This program is free software; you can redistribute it and/or @@ -17,95 +17,37 @@ * 02110-1301, USA */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/version.h> - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/net.h> -#include <linux/rculist.h> -#include <linux/udp.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/openvswitch.h> #include <linux/module.h> - -#include <net/icmp.h> -#include <net/ip.h> #include <net/udp.h> #include <net/ip_tunnels.h> #include <net/rtnetlink.h> -#include <net/route.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> #include <net/vxlan.h> #include "datapath.h" #include "vport.h" -#include "vport-vxlan.h" - -/** - * struct vxlan_port - Keeps track of open UDP ports - * @vs: vxlan_sock created for the port. - * @name: vport name. - */ -struct vxlan_port { - struct vxlan_sock *vs; - char name[IFNAMSIZ]; - u32 exts; /* VXLAN_F_* in <net/vxlan.h> */ -}; +#include "vport-netdev.h" -static struct vport_ops ovs_vxlan_vport_ops; - -static inline struct vxlan_port *vxlan_vport(const struct vport *vport) -{ - return vport_priv(vport); -} - -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md) -{ - struct ovs_tunnel_info tun_info; - struct vxlan_port *vxlan_port; - struct vport *vport = vs->data; - struct iphdr *iph; - struct ovs_vxlan_opts opts = { - .gbp = md->gbp, - }; - __be64 key; - __be16 flags; - - flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? 
TUNNEL_CSUM : 0); - vxlan_port = vxlan_vport(vport); - if (vxlan_port->exts & VXLAN_F_GBP && md->gbp) - flags |= TUNNEL_VXLAN_OPT; - - /* Save outer tunnel values */ - iph = ip_hdr(skb); - key = cpu_to_be64(ntohl(md->vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, &opts, sizeof(opts)); - - ovs_vport_receive(vport, skb, &tun_info); -} +static struct vport_ops ovs_vxlan_netdev_vport_ops; static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) { - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sport(vxlan_port->vs->sock->sk); + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + __be16 dst_port = vxlan->cfg.dst_port; if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) return -EMSGSIZE; - if (vxlan_port->exts) { + if (vxlan->flags & VXLAN_F_GBP) { struct nlattr *exts; exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); if (!exts) return -EMSGSIZE; - if (vxlan_port->exts & VXLAN_F_GBP && + if (vxlan->flags & VXLAN_F_GBP && nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) return -EMSGSIZE; @@ -115,23 +57,14 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) return 0; } -static void vxlan_tnl_destroy(struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - - vxlan_sock_release(vxlan_port->vs); - - ovs_vport_deferred_free(vport); -} - -static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = { +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, }; -static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, + struct vxlan_config *conf) { - struct nlattr *exts[OVS_VXLAN_EXT_MAX+1]; - struct vxlan_port *vxlan_port; + struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; int err; if (nla_len(attr) < sizeof(struct nlattr)) @@ -141,10 +74,8 @@ static int 
vxlan_configure_exts(struct vport *vport, struct nlattr *attr) if (err < 0) return err; - vxlan_port = vxlan_vport(vport); - if (exts[OVS_VXLAN_EXT_GBP]) - vxlan_port->exts |= VXLAN_F_GBP; + conf->flags |= VXLAN_F_GBP; return 0; } @@ -153,168 +84,103 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) { struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; - struct vxlan_port *vxlan_port; - struct vxlan_sock *vs; + struct net_device *dev; struct vport *vport; struct nlattr *a; - u16 dst_port; int err; + struct vxlan_config conf = { + .no_share = true, + .flags = VXLAN_F_COLLECT_METADATA, + }; if (!options) { err = -EINVAL; goto error; } + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); if (a && nla_len(a) == sizeof(u16)) { - dst_port = nla_get_u16(a); + conf.dst_port = htons(nla_get_u16(a)); } else { /* Require destination port from userspace. */ err = -EINVAL; goto error; } - vport = ovs_vport_alloc(sizeof(struct vxlan_port), - &ovs_vxlan_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; - vxlan_port = vxlan_vport(vport); - strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); if (a) { - err = vxlan_configure_exts(vport, a); + err = vxlan_configure_exts(vport, a, &conf); if (err) { ovs_vport_free(vport); goto error; } } - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, - vxlan_port->exts); - if (IS_ERR(vs)) { + rtnl_lock(); + dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); + if (IS_ERR(dev)) { + rtnl_unlock(); ovs_vport_free(vport); - return (void *)vs; + return ERR_CAST(dev); } - vxlan_port->vs = vs; + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; - error: return ERR_PTR(err); } -static int vxlan_ext_gbp(struct sk_buff *skb) -{ - const struct ovs_tunnel_info *tun_info; - const struct ovs_vxlan_opts *opts; - - tun_info = 
OVS_CB(skb)->egress_tun_info; - opts = tun_info->options; - - if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT && - tun_info->options_len >= sizeof(*opts)) - return opts->gbp; - else - return 0; -} - -static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) +static struct vport *vxlan_create(const struct vport_parms *parms) { - struct ovs_key_ipv4_tunnel *tun_key; - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sport(vxlan_port->vs->sock->sk); - struct vxlan_metadata md = {0}; - struct rtable *rt; - __be16 src_port; - __be32 saddr; - __be16 df; - int err; - u32 vxflags; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; - - /* Route lookup */ - saddr = tun_key->ipv4_src; - rt = find_route(ovs_dp_get_net(vport->dp), - &saddr, tun_key->ipv4_dst, - IPPROTO_UDP, tun_key->ipv4_tos, - skb->mark); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } + struct vport *vport; - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - skb->ignore_df = 1; + vport = vxlan_tnl_create(parms); + if (IS_ERR(vport)) + return vport; - src_port = udp_flow_src_port(net, skb, 0, 0, true); - md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8); - md.gbp = vxlan_ext_gbp(skb); - vxflags = vxlan_port->exts | - (tun_key->tun_flags & TUNNEL_CSUM ? 
VXLAN_F_UDP_CSUM : 0); - - err = vxlan_xmit_skb(rt, vxlan_port->vs->sock->sk, skb, - saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, df, - src_port, dst_port, - &md, false, vxflags); - if (err < 0) - ip_rt_put(rt); - return err; -error: - kfree_skb(skb); - return err; + return ovs_netdev_link(vport, parms->name); } static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct dp_upcall_info *upcall) { + struct vxlan_dev *vxlan = netdev_priv(vport->dev); struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sport(vxlan_port->vs->sock->sk); + __be16 dst_port = vxlan_dev_dst_port(vxlan); __be16 src_port; + int port_min; + int port_max; + inet_get_local_port_range(net, &port_min, &port_max); src_port = udp_flow_src_port(net, skb, 0, 0, true); - return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, + return ovs_tunnel_get_egress_info(upcall, net, + skb, IPPROTO_UDP, src_port, dst_port); } -static const char *vxlan_get_name(const struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - return vxlan_port->name; -} - -static struct vport_ops ovs_vxlan_vport_ops = { +static struct vport_ops ovs_vxlan_netdev_vport_ops = { .type = OVS_VPORT_TYPE_VXLAN, - .create = vxlan_tnl_create, - .destroy = vxlan_tnl_destroy, - .get_name = vxlan_get_name, + .create = vxlan_create, + .destroy = ovs_netdev_tunnel_destroy, .get_options = vxlan_get_options, - .send = vxlan_tnl_send, + .send = vxlan_xmit, .get_egress_tun_info = vxlan_get_egress_tun_info, - .owner = THIS_MODULE, }; static int __init ovs_vxlan_tnl_init(void) { - return ovs_vport_ops_register(&ovs_vxlan_vport_ops); + return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); } static void __exit ovs_vxlan_tnl_exit(void) { - ovs_vport_ops_unregister(&ovs_vxlan_vport_ops); + 
ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); } module_init(ovs_vxlan_tnl_init); diff --git a/datapath/vport-vxlan.h b/datapath/vport-vxlan.h deleted file mode 100644 index 4b08233e7..000000000 --- a/datapath/vport-vxlan.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef VPORT_VXLAN_H -#define VPORT_VXLAN_H 1 - -#include <linux/kernel.h> -#include <linux/types.h> - -struct ovs_vxlan_opts { - __u32 gbp; -}; - -#endif diff --git a/datapath/vport.c b/datapath/vport.c index 024491f0f..1e22c6d48 100644 --- a/datapath/vport.c +++ b/datapath/vport.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -20,26 +20,27 @@ #include <linux/if.h> #include <linux/if_vlan.h> #include <linux/jhash.h> -#include <linux/kconfig.h> #include <linux/kernel.h> #include <linux/list.h> -#include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/rtnetlink.h> #include <linux/compat.h> -#include <linux/version.h> +#include <linux/module.h> +#include <linux/if_link.h> #include <net/net_namespace.h> +#include <net/lisp.h> +#include <net/gre.h> +#include <net/geneve.h> +#include <net/vxlan.h> +#include <net/stt.h> #include "datapath.h" #include "gso.h" #include "vport.h" #include "vport-internal_dev.h" -static void ovs_vport_record_error(struct vport *, - enum vport_err_type err_type); - static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. 
*/ @@ -53,12 +54,42 @@ static struct hlist_head *dev_table; */ int ovs_vport_init(void) { + int err; + dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), GFP_KERNEL); if (!dev_table) return -ENOMEM; + err = lisp_init_module(); + if (err) + goto err_lisp; + err = ipgre_init(); + if (err) + goto err_gre; + err = geneve_init_module(); + if (err) + goto err_geneve; + + err = vxlan_init_module(); + if (err) + goto err_vxlan; + err = ovs_stt_init_module(); + if (err) + goto err_stt; return 0; + +err_stt: + vxlan_cleanup_module(); +err_vxlan: + geneve_cleanup_module(); +err_geneve: + ipgre_fini(); +err_gre: + lisp_cleanup_module(); +err_lisp: + kfree(dev_table); + return err; } /** @@ -68,6 +99,11 @@ int ovs_vport_init(void) */ void ovs_vport_exit(void) { + ovs_stt_cleanup_module(); + vxlan_cleanup_module(); + geneve_cleanup_module(); + ipgre_fini(); + lisp_cleanup_module(); kfree(dev_table); } @@ -84,8 +120,8 @@ int ovs_vport_ops_register(struct vport_ops *ops) ovs_lock(); list_for_each_entry(o, &vport_ops_list, list) - if (ops->type == o->type) - goto errout; + if (ops->type == o->type) + goto errout; list_add_tail(&ops->list, &vport_ops_list); err = 0; @@ -116,7 +152,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node) - if (!strcmp(name, vport->ops->get_name(vport)) && + if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -132,10 +168,10 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) * Allocate and initialize a new vport defined by @ops. The vport will contain * a private data area of size @priv_size that can be accessed using * vport_priv(). vports that are no longer needed should be released with - * ovs_vport_free(). + * vport_free(). 
*/ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, - const struct vport_parms *parms) + const struct vport_parms *parms) { struct vport *vport; size_t alloc_size; @@ -160,45 +196,41 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, return ERR_PTR(-EINVAL); } - vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!vport->percpu_stats) { - kfree(vport); - return ERR_PTR(-ENOMEM); - } - return vport; } EXPORT_SYMBOL_GPL(ovs_vport_alloc); -static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) -{ - struct vport_ops *ops; - - list_for_each_entry(ops, &vport_ops_list, list) - if (ops->type == parms->type) - return ops; - - return NULL; -} - /** * ovs_vport_free - uninitialize and free vport * * @vport: vport to free * - * Frees a vport allocated with ovs_vport_alloc() when it is no longer needed. + * Frees a vport allocated with vport_alloc() when it is no longer needed. * * The caller must ensure that an RCU grace period has passed since the last * time @vport was in a datapath. */ void ovs_vport_free(struct vport *vport) { + /* vport is freed from RCU callback or error path; therefore + * it is safe to use raw dereference. 
+ */ kfree(rcu_dereference_raw(vport->upcall_portids)); - free_percpu(vport->percpu_stats); kfree(vport); } EXPORT_SYMBOL_GPL(ovs_vport_free); +static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) +{ + struct vport_ops *ops; + + list_for_each_entry(ops, &vport_ops_list, list) + if (ops->type == parms->type) + return ops; + + return NULL; +} + /** * ovs_vport_add - add vport device (for kernel callers) * @@ -226,7 +258,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms) } bucket = hash_bucket(ovs_dp_get_net(vport->dp), - vport->ops->get_name(vport)); + ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } @@ -290,45 +322,19 @@ void ovs_vport_del(struct vport *vport) */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { - int i; - - /* We potentially have two surces of stats that need to be - * combined: those we have collected (split into err_stats and - * percpu_stats), and device error stats from netdev->get_stats() - * (for errors that happen downstream and therefore aren't - * reported through our vport_record_error() function). - * Stats from first source are reported by ovs over - * OVS_VPORT_ATTR_STATS. - * netdev-stats can be directly read over netlink-ioctl. 
- */ - - stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors); - stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors); - stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped); - stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped); - - stats->rx_bytes = 0; - stats->rx_packets = 0; - stats->tx_bytes = 0; - stats->tx_packets = 0; - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *percpu_stats; - struct pcpu_sw_netstats local_stats; - unsigned int start; - - percpu_stats = per_cpu_ptr(vport->percpu_stats, i); - - do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); - local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); - - stats->rx_bytes += local_stats.rx_bytes; - stats->rx_packets += local_stats.rx_packets; - stats->tx_bytes += local_stats.tx_bytes; - stats->tx_packets += local_stats.tx_packets; - } + const struct rtnl_link_stats64 *dev_stats; + struct rtnl_link_stats64 temp; + + dev_stats = dev_get_stats(vport->dev, &temp); + stats->rx_errors = dev_stats->rx_errors; + stats->tx_errors = dev_stats->tx_errors; + stats->tx_dropped = dev_stats->tx_dropped; + stats->rx_dropped = dev_stats->rx_dropped; + + stats->rx_bytes = dev_stats->rx_bytes; + stats->rx_packets = dev_stats->rx_packets; + stats->tx_bytes = dev_stats->tx_bytes; + stats->tx_packets = dev_stats->tx_packets; } /** @@ -399,7 +405,7 @@ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) old = ovsl_dereference(vport->upcall_portids); - vport_portids = kmalloc(sizeof *vport_portids + nla_len(ids), + vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), GFP_KERNEL); if (!vport_portids) return -ENOMEM; @@ -412,7 +418,6 @@ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) if (old) call_rcu(&old->rcu, vport_portids_destroy_rcu_cb); - return 0; } @@ -439,7 +444,7 @@ int ovs_vport_get_upcall_portids(const struct vport *vport, if 
(vport->dp->user_features & OVS_DP_F_VPORT_PIDS) return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, - ids->n_ids * sizeof(u32), (void *) ids->ids); + ids->n_ids * sizeof(u32), (void *)ids->ids); else return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); } @@ -458,6 +463,7 @@ int ovs_vport_get_upcall_portids(const struct vport *vport, u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) { struct vport_portids *ids; + u32 ids_index; u32 hash; ids = rcu_dereference(vport->upcall_portids); @@ -466,7 +472,8 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) return 0; hash = skb_get_hash(skb); - return ids->ids[hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids)]; + ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); + return ids->ids[ids_index]; } /** @@ -474,99 +481,31 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * * @vport: vport that received the packet * @skb: skb that was received - * @tun_info: tunnel (if any) that carried packet + * @tun_key: tunnel (if any) that carried packet * * Must be called with rcu_read_lock. The packet cannot be shared and - * skb->data should point to the Ethernet header. The caller must have already - * called compute_ip_summed() to initialize the checksumming fields. + * skb->data should point to the Ethernet header. */ -void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ovs_tunnel_info *tun_info) +int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + const struct ip_tunnel_info *tun_info) { - struct pcpu_sw_netstats *stats; struct sw_flow_key key; int error; - stats = this_cpu_ptr(vport->percpu_stats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len + (skb_vlan_tag_present(skb) ? 
VLAN_HLEN : 0); - u64_stats_update_end(&stats->syncp); - - ovs_skb_init_inner_protocol(skb); OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->egress_tun_info = NULL; + ovs_skb_init_inner_protocol(skb); + skb_clear_ovs_gso_cb(skb); + /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); - return; + return error; } ovs_dp_process_packet(skb, &key); + return 0; } EXPORT_SYMBOL_GPL(ovs_vport_receive); -/** - * ovs_vport_send - send a packet on a device - * - * @vport: vport on which to send the packet - * @skb: skb to send - * - * Sends the given packet and returns the length of data sent. Either ovs - * lock or rcu_read_lock must be held. - */ -int ovs_vport_send(struct vport *vport, struct sk_buff *skb) -{ - int sent = vport->ops->send(vport, skb); - - if (likely(sent > 0)) { - struct pcpu_sw_netstats *stats; - - stats = this_cpu_ptr(vport->percpu_stats); - - u64_stats_update_begin(&stats->syncp); - stats->tx_packets++; - stats->tx_bytes += sent; - u64_stats_update_end(&stats->syncp); - } else if (sent < 0) { - ovs_vport_record_error(vport, VPORT_E_TX_ERROR); - } else { - ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); - } - return sent; -} - -/** - * ovs_vport_record_error - indicate device error to generic stats layer - * - * @vport: vport that encountered the error - * @err_type: one of enum vport_err_type types to indicate the error type - * - * If using the vport generic stats layer indicate that an error of the given - * type has occurred. 
- */ -static void ovs_vport_record_error(struct vport *vport, - enum vport_err_type err_type) -{ - switch (err_type) { - case VPORT_E_RX_DROPPED: - atomic_long_inc(&vport->err_stats.rx_dropped); - break; - - case VPORT_E_RX_ERROR: - atomic_long_inc(&vport->err_stats.rx_errors); - break; - - case VPORT_E_TX_DROPPED: - atomic_long_inc(&vport->err_stats.tx_dropped); - break; - - case VPORT_E_TX_ERROR: - atomic_long_inc(&vport->err_stats.tx_errors); - break; - } - -} - static void free_vport_rcu(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); @@ -583,33 +522,32 @@ void ovs_vport_deferred_free(struct vport *vport) } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, struct net *net, - const struct ovs_tunnel_info *tun_info, + struct sk_buff *skb, u8 ipproto, - u32 skb_mark, __be16 tp_src, __be16 tp_dst) { - const struct ovs_key_ipv4_tunnel *tun_key; + struct ip_tunnel_info *egress_tun_info = upcall->egress_tun_info; + struct ip_tunnel_info *tun_info = skb_tunnel_info(skb); + const struct ip_tunnel_key *tun_key; + u32 skb_mark = skb->mark; struct rtable *rt; - __be32 saddr; + struct flowi4 fl; if (unlikely(!tun_info)) return -EINVAL; + if (ip_tunnel_info_af(tun_info) != AF_INET) + return -EINVAL; + + tun_key = &tun_info->key; - tun_key = &tun_info->tunnel; - saddr = tun_key->ipv4_src; - /* Route lookup to get srouce IP address: saddr. + /* Route lookup to get source IP address. * The process may need to be changed if the corresponding process * in vports ops changed. 
*/ - rt = find_route(net, - &saddr, - tun_key->ipv4_dst, - ipproto, - tun_key->ipv4_tos, - skb_mark); + rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -618,26 +556,56 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, /* Generate egress_tun_info based on tun_info, * saddr, tp_src and tp_dst */ - __ovs_flow_tun_info_init(egress_tun_info, - saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags, - tun_info->options, - tun_info->options_len); - + ip_tunnel_key_init(&egress_tun_info->key, + fl.saddr, tun_key->u.ipv4.dst, + tun_key->tos, + tun_key->ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags); + egress_tun_info->options_len = tun_info->options_len; + egress_tun_info->mode = tun_info->mode; + upcall->egress_tun_opts = ip_tunnel_info_opts(tun_info); return 0; } EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info) + struct dp_upcall_info *upcall) { /* get_egress_tun_info() is only implemented on tunnel ports. 
*/ if (unlikely(!vport->ops->get_egress_tun_info)) return -EINVAL; - return vport->ops->get_egress_tun_info(vport, skb, info); + return vport->ops->get_egress_tun_info(vport, skb, upcall); +} + +static unsigned int packet_length(const struct sk_buff *skb) +{ + unsigned int length = skb->len - ETH_HLEN; + + if (skb->protocol == htons(ETH_P_8021Q)) + length -= VLAN_HLEN; + + return length; +} + +void ovs_vport_send(struct vport *vport, struct sk_buff *skb) +{ + int mtu = vport->dev->mtu; + + if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { + net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", + vport->dev->name, + packet_length(skb), mtu); + vport->dev->stats.tx_errors++; + goto drop; + } + + skb->dev = vport->dev; + vport->ops->send(skb); + return; + +drop: + kfree_skb(skb); } diff --git a/datapath/vport.h b/datapath/vport.h index b217b8862..d82071970 100644 --- a/datapath/vport.h +++ b/datapath/vport.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2015 Nicira, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -27,14 +27,14 @@ #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/u64_stats_sync.h> +#include <net/route.h> + +#include "datapath.h" struct vport; struct vport_parms; /* The following definitions are for users of the vport subsytem: */ -struct vport_net { - struct vport __rcu *gre_vport; -}; int ovs_vport_init(void); void ovs_vport_exit(void); @@ -53,25 +53,15 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); -int ovs_vport_send(struct vport *, struct sk_buff *); - -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, struct net *net, - const struct ovs_tunnel_info *tun_info, + struct sk_buff *, u8 ipproto, - u32 skb_mark, __be16 tp_src, __be16 tp_dst); + int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info); - -/* The following definitions are for implementers of vport devices: */ -struct vport_err_stats { - atomic_long_t rx_dropped; - atomic_long_t rx_errors; - atomic_long_t tx_dropped; - atomic_long_t tx_errors; -}; + struct dp_upcall_info *upcall); /** * struct vport_portids - array of netlink portids of a vport. @@ -98,12 +88,10 @@ struct vport_portids { * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. - * @percpu_stats: Points to per-CPU statistics used and maintained by vport - * @err_stats: Points to error statistics used and maintained by vport * @detach_list: list used for detaching vport in net-exit call. 
*/ struct vport { - struct rcu_head rcu; + struct net_device *dev; struct datapath *dp; struct vport_portids __rcu *upcall_portids; u16 port_no; @@ -112,10 +100,8 @@ struct vport { struct hlist_node dp_hash_node; const struct vport_ops *ops; - struct pcpu_sw_netstats __percpu *percpu_stats; - - struct vport_err_stats err_stats; struct list_head detach_list; + struct rcu_head rcu; }; /** @@ -152,8 +138,7 @@ struct vport_parms { * @get_options: Appends vport-specific attributes for the configuration of an * existing vport to a &struct sk_buff. May be %NULL for a vport that does not * have any configuration. - * @get_name: Get the device's name. - * @send: Send a packet on the device. Returns the length of the packet sent, + * @send: Send a packet on the device. * zero for dropped packets or negative for error. * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for * a packet. @@ -168,24 +153,14 @@ struct vport_ops { int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - /* Called with rcu_read_lock or ovs_mutex. 
*/ - const char *(*get_name)(const struct vport *); - - int (*send)(struct vport *, struct sk_buff *); int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct ovs_tunnel_info *); + struct dp_upcall_info *upcall); + netdev_tx_t (*send)(struct sk_buff *skb); struct module *owner; struct list_head list; }; -enum vport_err_type { - VPORT_E_RX_DROPPED, - VPORT_E_RX_ERROR, - VPORT_E_TX_DROPPED, - VPORT_E_TX_ERROR, -}; - struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); @@ -222,8 +197,8 @@ static inline struct vport *vport_from_priv(void *priv) return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } -void ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ovs_tunnel_info *); +int ovs_vport_receive(struct vport *, struct sk_buff *, + const struct ip_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -232,6 +207,32 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); } +static inline const char *ovs_vport_name(struct vport *vport) +{ + return vport->dev->name; +} + int ovs_vport_ops_register(struct vport_ops *ops); void ovs_vport_ops_unregister(struct vport_ops *ops); + +static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, + const struct ip_tunnel_key *key, + u32 mark, + struct flowi4 *fl, + u8 protocol) +{ + struct rtable *rt; + + memset(fl, 0, sizeof(*fl)); + fl->daddr = key->u.ipv4.dst; + fl->saddr = key->u.ipv4.src; + fl->flowi4_tos = RT_TOS(key->tos); + fl->flowi4_mark = mark; + fl->flowi4_proto = protocol; + + rt = ip_route_output_key(net, fl); + return rt; +} + +void ovs_vport_send(struct vport *vport, struct sk_buff *skb); #endif /* vport.h */ |